In [3]:
import json
import re

import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm

# When True, get_image_sizes ignores any cached entry and re-queries the
# server for every page (the fresh result is still written to the cache).
FORCE_REFRESH_IMAGE_SIZES = False

In [4]:
# Retry policy: up to 5 retries with backoff, also triggered by the listed
# HTTP status codes (rate limiting and server-side errors).
retries = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503, 504],
)

# Shared HTTP session; only https:// URLs are routed through the retrying adapter.
s = requests.Session()
s.mount("https://", HTTPAdapter(max_retries=retries))

In [12]:
# Matches the plain-text body of the server's HTTP 500 answer to an
# out-of-range page index; group 1 captures the reported length.
# The two adjacent raw literals concatenate into a single pattern.
out_of_bounds_regex = re.compile(
    r"500 Internal Server Error\n\n"
    r"Index \d+ out of bounds for length (\d+)\n\n\n"
)


def get_pdf_num_pages(pdf_iiif_base_url, timeout=60):
    """Return the length the image server reports for a PDF document.

    The value is not requested directly. Instead ``info.json`` is requested
    for page index 999999999, and the length is parsed out of the resulting
    "Index ... out of bounds for length N" error body using
    ``out_of_bounds_regex``. Callers use the result as the page count.

    Args:
        pdf_iiif_base_url: IIIF image URL of the document, without the
            ``;<page>`` suffix.
        timeout: Seconds passed to ``requests.get`` so that an unresponsive
            server cannot block the notebook indefinitely.

    Returns:
        The captured length, converted to ``int``.

    Raises:
        AssertionError: If the response status is not 500 or the body does
            not match the expected error text.
        requests.exceptions.RequestException: On connection failure or timeout.
    """
    # NOTE: uses plain requests.get rather than the shared session `s`.
    # That session's retry policy includes status 500, which is exactly the
    # status expected here, so switching to `s` would change the behaviour.
    response = requests.get(
        pdf_iiif_base_url + ";999999999/info.json", timeout=timeout
    )
    assert response.status_code == 500, (
        f"expected HTTP 500 from the out-of-range page probe, "
        f"got {response.status_code}"
    )
    bounds_match = out_of_bounds_regex.match(response.text)
    assert bounds_match is not None, (
        f"unexpected error body from the page probe: {response.text[:200]!r}"
    )
    # The pattern has exactly one capture group, so group(1) always exists here.
    return int(bounds_match.group(1))


# In-memory cache used by get_image_sizes: maps an info.json URL to the
# (height, width) tuple read from that URL. Re-running this cell empties it.
image_sizes_cache = {}


def get_image_sizes(pdf_iiif_base_url, page_number, timeout=60):
    """Return ``(height, width)`` for one page, as reported by its info.json.

    Results are memoised in ``image_sizes_cache`` keyed by the info.json URL.
    The cache is bypassed (but still refreshed) when
    ``FORCE_REFRESH_IMAGE_SIZES`` is true.

    Args:
        pdf_iiif_base_url: IIIF image URL of the document, without the
            ``;<page>`` suffix.
        page_number: Page index appended after ``;`` in the URL.
        timeout: Seconds passed to the session's ``get`` call so that a
            stalled connection cannot hang the notebook.

    Returns:
        A ``(height, width)`` tuple taken from the "height" and "width" keys
        of the JSON response.

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error
            status (via ``raise_for_status``).
        KeyError: If the JSON response lacks "height" or "width".
    """
    query_url = f"{pdf_iiif_base_url};{page_number}/info.json"
    if not FORCE_REFRESH_IMAGE_SIZES and query_url in image_sizes_cache:
        return image_sizes_cache[query_url]
    response = s.get(query_url, timeout=timeout)
    response.raise_for_status()
    # Decode the body once instead of once per key.
    info = response.json()
    sizes = (info["height"], info["width"])
    image_sizes_cache[query_url] = sizes
    return sizes


def generate_iiif_image_items(base_manifest_uri, document_name):
    """Build one IIIF Canvas dict per page of a PDF document.

    The number of pages comes from ``get_pdf_num_pages`` and each page's
    dimensions from ``get_image_sizes``; progress is shown with ``tqdm``.

    Args:
        base_manifest_uri: URI prefix used for the canvas, annotation page
            and annotation ids.
        document_name: Document name interpolated into the image server URL.

    Returns:
        A list of Canvas dicts ordered by page, pages being numbered from 1.
        Each canvas holds a single AnnotationPage with one "painting"
        Annotation whose body is the full-size JPEG of the page.
    """
    image_root = f"https://media.performing-arts.ch/iiif/3/document%2f{document_name}"
    page_count = get_pdf_num_pages(image_root)

    canvases = []
    for page in tqdm(range(1, page_count + 1)):
        page_height, page_width = get_image_sizes(image_root, page)
        canvas_id = f"{base_manifest_uri}/canvas/p{page}"
        # Image service id of this page; also the prefix of its JPEG URL.
        service_id = f"{image_root};{page}"

        image_body = {
            "id": f"{service_id}/full/max/0/default.jpg",
            "type": "Image",
            "format": "image/jpeg",
            "height": page_height,
            "width": page_width,
            "service": [
                {
                    "id": service_id,
                    "type": "ImageService3",
                    "profile": "level1",
                }
            ],
        }
        painting_annotation = {
            # The annotation id zero-pads the page number to four digits,
            # unlike the canvas and annotation page ids.
            "id": f"{base_manifest_uri}/annotation/p{page:04d}-image",
            "type": "Annotation",
            "motivation": "painting",
            "body": image_body,
            "target": canvas_id,
        }
        annotation_page = {
            "id": f"{base_manifest_uri}/page/p{page}/1",
            "type": "AnnotationPage",
            "items": [painting_annotation],
        }
        page_label = {
            "en": [f"Page {page}"],
            "de": [f"Seite {page}"],
            "fr": [f"Page {page}"],
            "it": [f"Pagina {page}"],
        }

        canvases.append(
            {
                "id": canvas_id,
                "type": "Canvas",
                "height": page_height,
                "width": page_width,
                "label": page_label,
                # Per-canvas thumbnail, currently disabled:
                # "thumbnail": [
                #     {
                #         "id": f"{service_id}/full/!300,300/0/default.jpg",
                #         "type": "Image",
                #         "format": "image/jpeg",
                #         "width": int(page_width * 300 / page_height),
                #         "height": 300,
                #     }
                # ],
                "items": [annotation_page],
            }
        )
    return canvases

In [13]:
# Name of the document to process; it is interpolated into the image server
# URL, the manifest URI and the output file name in the cells below.
document_name = "SAPA_Jahresbericht_2022_FR-1"


In [14]:
# Build the per-page canvases once; this also fills image_sizes_cache.
# The manifest cell below builds its own list with a second call.
iiif_items = generate_iiif_image_items(
    f"https://media.performing-arts.ch/iiif/manifest/{document_name}",
    document_name,
)

  0%|          | 0/48 [00:00<?, ?it/s]

In [15]:
# Image server URL of the document, built with the same pattern that
# generate_iiif_image_items uses internally.
pdf_iiif_base_url = (
    f"https://media.performing-arts.ch/iiif/3/document%2f{document_name}"
)

# IIIF Presentation API 3 manifest for the document: a paged, left-to-right
# sequence of one canvas per page.
manifest = {
    "@context": "http://iiif.io/api/presentation/3/context.json",
    "id": f"https://media.performing-arts.ch/iiif/manifest/{document_name}.json",
    "label": {"en": [document_name]},
    "type": "Manifest",
    "behavior": ["paged"],
    "viewingDirection": "left-to-right",
    "rendering": [
        {
            # NOTE(review): this id is an Image API JPEG URL, yet the entry is
            # declared as an application/pdf document. Confirm which URL
            # actually serves the PDF and use it here.
            "id": f"{pdf_iiif_base_url}/full/max/0/default.jpg",
            "type": "Document",
            # Language map values must be arrays of strings in Presentation
            # API 3; bare strings are rejected by validators.
            "label": {"de": ["PDF"], "fr": ["PDF"], "en": ["PDF"]},
            "format": "application/pdf",
        }
    ],
    # Canvas ids are derived from the manifest URI without the ".json" suffix.
    "items": generate_iiif_image_items(
        f"https://media.performing-arts.ch/iiif/manifest/{document_name}", document_name
    ),
}

  0%|          | 0/48 [00:00<?, ?it/s]

In [16]:
# Write the manifest next to the notebook as "<document_name>.json".
# ensure_ascii=False emits non-ASCII characters unescaped, so the file is
# opened explicitly as UTF-8 instead of relying on the platform default.
with open(f'{document_name}.json', 'w', encoding='utf-8') as outfile:
    json.dump(manifest, outfile, ensure_ascii=False)