Get metadata from the SAPA platform.


In [1]:
%load_ext autoreload
%autoreload 2
import json
import os
import sys
from getpass import getpass
from urllib.parse import quote

import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from SPARQLWrapper import JSON, SPARQLWrapper
from tqdm.auto import tqdm
from urllib3.util.retry import Retry

# Put the parent directory on the import path; the `helpers` package imported
# further down is presumably located there (TODO confirm).
module_path = os.path.abspath("..")
if module_path not in sys.path:
    sys.path.append(module_path)

# Prompt for each S3 credential only when it is not already defined in the
# kernel namespace, so re-running this cell does not ask for it again.
if "S3_CLIENT_ID" not in globals():
    S3_CLIENT_ID = getpass("Please input s3 client id")

if "S3_CLIENT_SECRET" not in globals():
    S3_CLIENT_SECRET = getpass("Please input s3 client secret")


# Enable tqdm's pandas integration.
tqdm.pandas()

In [5]:
sparql = SPARQLWrapper("https://www.performing-arts.ch/sparql")

# The query walks serie -> dossier -> resource below one fixed parent record and
# keeps the resources that have a `spav:iduni` identifier, a descriptive note and
# a name. The `#SELECT ...` line inside the query is a SPARQL comment.
sparql.setQuery(
    """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
PREFIX spav: <http://vocab.performing-arts.ch/>
PREFIX rico: <https://www.ica.org/standards/RiC/ontology#>
#SELECT ?resource ?identifier_value ?descriptive_note ?name WHERE {
SELECT * WHERE {
  ?serie rico:isOrWasIncludedIn <http://data.performing-arts.ch/r/9636965d-b8e7-4009-950a-8a1f39f89456>.
  ?dossier rico:isOrWasIncludedIn ?serie.
  ?resource rico:isOrWasIncludedIn ?dossier ;
  	rico:hasOrHadIdentifier ?identifier;
    rico:descriptiveNote ?descriptive_note;
    rico:name ?name.
  ?identifier rdf:value ?identifier_value;
      crm:P2_has_type spav:iduni.
} ORDER BY ?identifier_value
"""
)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Flatten every result binding to {variable name: value} and build one row per binding.
records = []
for binding in results["results"]["bindings"]:
    records.append({variable: term["value"] for variable, term in binding.items()})

metadata = pd.DataFrame(records)

We try to fetch the image sizes from the IIIF server. The results are cached in `iiif_image_sizes.csv` and reused the next time this notebook is run; set `force_compute_images_size` to `True` to force them to be recomputed.
This cell can be re-run as often as needed: sizes that were already fetched are skipped, so running it a second time only retries the identifiers that failed (e.g. because of a transient network error).


In [6]:
def identifier_to_image_base_url(identifier):
    """Return the IIIF image base URL for a record identifier.

    Args:
        identifier: Record identifier, e.g. ``"1553-FO-360-2"``.

    Returns:
        The base URL of the image (no ``/info.json`` or image request suffix).
    """
    # The identifier is part of a single URL path segment, so every reserved
    # character (including "/") is percent-encoded. Identifiers made only of
    # letters, digits and "-" are left unchanged.
    encoded_identifier = quote(str(identifier), safe="")
    return f"https://media.performing-arts.ch/iiif/3/image%2F{encoded_identifier}-DC"

In [7]:
# Set to True to ignore the CSV cache and fetch every image size again.
force_compute_images_size = False

IMAGE_SIZES_CACHE = "iiif_image_sizes.csv"
# Seconds to wait for the server before a request is abandoned; without a
# timeout a stalled connection would block this cell indefinitely.
REQUEST_TIMEOUT = 30

# identifier -> (width, height)
identifier2image_size = {}

if not force_compute_images_size and os.path.exists(IMAGE_SIZES_CACHE):
    cached_sizes = pd.read_csv(
        IMAGE_SIZES_CACHE,
        index_col=0,
        dtype={"identifier": str, "width": int, "height": int},
    )
    # A cache written when no size could be fetched may lack the size columns.
    if {"width", "height"}.issubset(cached_sizes.columns):
        # Convert to plain `int` so cached sizes have the same type as freshly
        # fetched ones (pandas would otherwise hand back NumPy integers).
        identifier2image_size = {
            str(identifier): (int(width), int(height))
            for identifier, width, height in zip(
                cached_sizes.index, cached_sizes["width"], cached_sizes["height"]
            )
        }

retries = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])

# The context manager closes the session's connections once the loop is done.
with requests.Session() as s:
    s.mount("https://", HTTPAdapter(max_retries=retries))

    for identifier in tqdm(metadata["identifier_value"], total=len(metadata)):
        # Already known (from the cache or an earlier run of this cell).
        if identifier in identifier2image_size:
            continue
        iiif_url = identifier_to_image_base_url(identifier) + "/info.json"
        try:
            res = s.get(iiif_url, timeout=REQUEST_TIMEOUT)
            if res.status_code == 404:
                print(f"Error with {identifier}, URL {iiif_url} does not exists")
                continue
            res.raise_for_status()
            res_json = res.json()
            identifier2image_size[identifier] = (
                int(res_json["width"]),
                int(res_json["height"]),
            )
        except Exception as e:
            # Deliberately best-effort: report the failure and carry on with the
            # next identifier; the cell can be re-run to retry the missing ones.
            print(f"Error with {identifier}, got the following error with {iiif_url}: ", e)

# Always write both size columns, even when nothing could be fetched, so the
# cache file keeps a stable header.
pd.DataFrame.from_dict(
    identifier2image_size, orient="index", columns=["width", "height"]
).rename_axis("identifier").to_csv(IMAGE_SIZES_CACHE)

  0%|          | 0/1004 [00:00<?, ?it/s]

Error with 1553-FO-360-2, URL https://media.performing-arts.ch/iiif/3/image%2F1553-FO-360-2-DC/info.json does not exists
Error with 1553-FO-601-1, URL https://media.performing-arts.ch/iiif/3/image%2F1553-FO-601-1-DC/info.json does not exists
Error with 1553-FO-73-2, URL https://media.performing-arts.ch/iiif/3/image%2F1553-FO-73-2-DC/info.json does not exists
Error with 1553-FO-742-2, URL https://media.performing-arts.ch/iiif/3/image%2F1553-FO-742-2-DC/info.json does not exists


Verify that all images match a record and vice versa:

In [8]:
# Compare the identifiers we have an image size for with the identifiers found
# in the metadata, in both directions.
sized_identifiers = set(identifier2image_size)
record_identifiers = set(metadata["identifier_value"])

# Images without a matching record
images_no_record = sized_identifiers - record_identifiers
if images_no_record:
    print("Some images do not have a record: ", images_no_record)

# Records without a matching image
records_no_image = record_identifiers - sized_identifiers
if records_no_image:
    print("Some records do not have an image: ", records_no_image)

Some records do not have an image:  {'1553-FO-742-2', '1553-FO-73-2', '1553-FO-360-2', '1553-FO-601-1'}


Remove the records for which no IIIF image size could be fetched:


In [9]:
# Keep only the records whose identifier has a known image size ...
has_image_size = metadata["identifier_value"].isin(identifier2image_size.keys())
metadata = metadata.loc[has_image_size].copy()
# ... and store that (width, height) tuple next to each record.
metadata["width_height"] = metadata["identifier_value"].apply(
    lambda identifier: identifier2image_size.get(identifier)
)

Create IIIF mapping


In [10]:
def identifier_to_base_url(identifier):
    """Return the manifest base URL for a record identifier (percent-encoded)."""
    encoded_identifier = quote(identifier)
    return f"https://media.performing-arts.ch/iiif/manifest/{encoded_identifier}"


# Name of the providing institution, per language.
provider_label = {
    "en": ["SAPA, Swiss Archive of the Performing Arts"],
    "de": ["Stiftung SAPA, Schweizer Archiv der Darstellenden Künste"],
    "fr": ["Fondation SAPA, Archives suisses des arts de la scène"],
    "it": ["Fondazione SAPA, Archivio svizzero delle arti della scena"],
}

# Text used as the label of the provider's homepage entry, per language.
provider_homepage_label = {
    "en": [
        "The SAPA Foundation, Swiss Archive of the Performing Arts, collects documents and objects of importance to the history of the performing arts and makes them accessible to a wider audience."
    ],
    "de": [
        "Die Stiftung SAPA, Schweizer Archiv der Darstellenden Künste, sammelt Dokumente und Objekte, die für die Geschichte der Darstellenden Künste bedeutsam sind, und stellt diese einem breiten Publikum zur Verfügung."
    ],
    "fr": [
        "La Fondation SAPA, Archives suisses des arts de la scène, collecte et met à disposition de tous les publics les documents et objets constituant l‘histoire des arts de la scène en Suisse. Sa mission: préserver les traces de ces arts éphémères et complexes pour les transmettre aux générations futures."
    ],
    "it": [
        "SAPA raccoglie e mette a disposizione del pubblico documenti e oggetti di rilevanza storica per le arti sceniche in Svizzera. La Fondazione si pone l’obiettivo di preservare le tracce di queste arti effimere e complesse per tramandarle alle generazioni future."
    ],
}

provider_homepage = {
    "id": "https://sapa.swiss/",
    "type": "Text",
    "label": provider_homepage_label,
    "format": "text/html",
}

provider_logo = {
    "id": "https://memobase.ch/sites/default/files/2021-05/sap-logo.jpg",
    "type": "Image",
    "format": "image/jpeg",
    "height": 100,
    "width": 260,
}

sapa_provider = {
    "id": "https://www.wikidata.org/entity/Q50920401",
    "type": "Agent",
    "label": provider_label,
    "homepage": [provider_homepage],
    "logo": [provider_logo],
}

# Manifest properties that do not depend on the individual record; this dict is
# handed to the manifest builder for every record.
base_metadata = {
    "@context": "http://iiif.io/api/presentation/3/context.json",
    "type": "Manifest",
    "rights": "http://creativecommons.org/licenses/by-sa/4.0/",
    "requiredStatement": {
        "label": {"en": ["Copyright"]},
        "value": {"en": ["Fred Erismann"]},
    },
    "homepage": [
        {
            "id": "https://sapa.swiss",
            "type": "Text",
            "label": {"en": ["SAPA Homepage"]},
            "format": "text/html",
        }
    ],
    "provider": [sapa_provider],
    "viewingDirection": "left-to-right",
}

In [11]:
from helpers.iiif import IIIFImageItem, create_manifest_from_iiif_images


def metadata_to_manifest(row):
    """Build the IIIF manifest for one metadata record.

    Args:
        row: Record (e.g. one row of ``metadata``) providing the keys
            ``identifier_value``, ``name``, ``resource`` and ``descriptive_note``.

    Returns:
        The result of ``create_manifest_from_iiif_images`` for a manifest that
        contains the single image belonging to the record.

    Raises:
        KeyError: If the record's identifier has no entry in
            ``identifier2image_size``.
    """
    identifier = row["identifier_value"]
    base_url = identifier_to_base_url(identifier)
    iiif_url = identifier_to_image_base_url(identifier)
    width, height = identifier2image_size[identifier]

    return create_manifest_from_iiif_images(
        base_url,
        # Sizes read back from the CSV cache may be NumPy integers; plain ints
        # are always JSON-serializable.
        [IIIFImageItem(iiif_url, int(width), int(height))],
        base_metadata,
        label=row["name"],
        summary=row["name"],
        sapa_resource=row["resource"],
        identifier=identifier,
        description=row["descriptive_note"],
        creator="Fred Erismann",
    )

Write the manifests to local files (only needed for debugging or for uploading them elsewhere; comment this cell out otherwise):

In [10]:
os.makedirs("manifests", exist_ok=True)
for _, item in tqdm(metadata.iterrows(), total=len(metadata)):
    manifest_path = os.path.join("manifests", item["identifier_value"] + ".json")
    # Build and serialize the manifest before opening the file, so that a
    # failure does not leave an empty file behind.
    manifest_json = json.dumps(metadata_to_manifest(item), ensure_ascii=False)
    # ensure_ascii=False keeps non-ASCII characters as-is, so the encoding is
    # set explicitly instead of relying on the platform default.
    with open(manifest_path, "w", encoding="utf-8") as outfile:
        outfile.write(manifest_json)

  0%|          | 0/1000 [00:00<?, ?it/s]

Write manifests directly to s3


In [12]:
from helpers.s3 import S3Client


client = S3Client(S3_CLIENT_ID, S3_CLIENT_SECRET)

# Build one manifest per record and upload it under "manifests/<identifier>.json".
upload_progress = tqdm(metadata.iterrows(), total=len(metadata))
for _, record in upload_progress:
    manifest = metadata_to_manifest(record)
    object_key = "manifests/" + record["identifier_value"] + ".json"
    client.upload_as_json(manifest, object_key)

  0%|          | 0/1000 [00:00<?, ?it/s]