In [131]:
import requests
import xml.etree.ElementTree as ET
import csv
import time
import json
import os
import pickle

In [132]:
# OAI-PMH endpoint that is harvested, and the metadata format requested from it.
BASE_URL = "https://data.rijksmuseum.nl/oai"
METADATA_PREFIX = "edm"

# Prefix -> namespace URI map passed to ElementTree find()/findall() so that
# XPath expressions can use short prefixes such as "oai:header".
ns = dict(
    oai="http://www.openarchives.org/OAI/2.0/",
    rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    dc="http://purl.org/dc/elements/1.1/",
    dcterms="http://purl.org/dc/terms/",
    edm="http://www.europeana.eu/schemas/edm/",
    ore="http://www.openarchives.org/ore/terms/",
    skos="http://www.w3.org/2004/02/skos/core#",
)

In [None]:
ident_file = "rijksmuseum_identifiers.json"
all_identifiers = []

REQUEST_TIMEOUT = 60   # seconds; without a timeout a stalled server hangs the cell forever
PAGE_DELAY = 0.5       # seconds to pause between pages, to be polite to the server


def _save_identifiers(path, identifiers):
    """Write `identifiers` to `path` as JSON without ever leaving a half-written file.

    The data is first written to a sibling temporary file and then moved over
    the target with os.replace, so an interrupt during the write cannot
    truncate the previously saved list.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as out:
        json.dump(identifiers, out, ensure_ascii=False, indent=2)
    os.replace(tmp_path, path)


# Reload identifiers collected by a previous run.
# NOTE: only the identifiers are persisted, not the resumption token, so the
# harvest below still starts from the first page; identifiers that are
# already known are simply skipped.
if os.path.exists(ident_file):
    with open(ident_file, "r", encoding="utf-8") as f:
        all_identifiers = json.load(f)
    print(f"Loaded {len(all_identifiers)} identifiers.")
else:
    print("No identifier file found. Starting fresh.")

# Set mirror of the list: O(1) duplicate checks instead of scanning the list
# for every header (the list itself keeps the harvest order for the JSON file).
seen_identifiers = set(all_identifiers)

resumption_token = None
page = 0

try:
    while True:
        # Per OAI-PMH, a resumptionToken request carries no other arguments
        # besides the verb. Passing the arguments via `params` lets requests
        # URL-encode the token (it may contain characters such as '+', '/' or '=').
        if resumption_token:
            params = {"verb": "ListIdentifiers", "resumptionToken": resumption_token}
        else:
            params = {"verb": "ListIdentifiers", "metadataPrefix": METADATA_PREFIX}

        response = requests.get(BASE_URL, params=params, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()  # fail on HTTP errors instead of parsing an error page
        root = ET.fromstring(response.content)

        # An OAI-PMH error (e.g. an expired token) comes back as a well-formed
        # document with an <error> element; do not mistake it for the last page.
        error_elem = root.find("oai:error", ns)
        if error_elem is not None:
            error_code = error_elem.attrib.get("code")
            if error_code == "noRecordsMatch":
                print("No records match the request.")
                break
            raise RuntimeError(
                f"OAI-PMH error {error_code!r}: {(error_elem.text or '').strip()}"
            )

        for header in root.findall(".//oai:header", ns):
            if header.attrib.get("status") == "deleted":
                continue
            id_elem = header.find("oai:identifier", ns)
            if id_elem is None or not id_elem.text:
                continue  # malformed header without an identifier
            identifier = id_elem.text
            if identifier not in seen_identifiers:
                seen_identifiers.add(identifier)
                all_identifiers.append(identifier)

        # Persist after every page so an interruption loses at most one page.
        _save_identifiers(ident_file, all_identifiers)

        page += 1
        print(f"Saved page {page}, total identifiers: {len(all_identifiers)}")

        # An absent or empty resumptionToken marks the final page.
        token_elem = root.find(".//oai:resumptionToken", ns)
        next_token = (token_elem.text or "").strip() if token_elem is not None else ""
        if not next_token:
            break
        resumption_token = next_token
        time.sleep(PAGE_DELAY)

except KeyboardInterrupt:
    # Flush whatever was collected from the page being processed when interrupted.
    _save_identifiers(ident_file, all_identifiers)
    print("\nInterrupted — identifiers saved safely.")

No identifier file found. Starting fresh.
Saved page 1, total identifiers: 50
Saved page 2, total identifiers: 100
Saved page 3, total identifiers: 150
Saved page 4, total identifiers: 190
Saved page 5, total identifiers: 233
Saved page 6, total identifiers: 283
Saved page 7, total identifiers: 333
Saved page 8, total identifiers: 383
Saved page 9, total identifiers: 425
Saved page 10, total identifiers: 475
Saved page 11, total identifiers: 525
Saved page 12, total identifiers: 575
Saved page 13, total identifiers: 625
Saved page 14, total identifiers: 675
Saved page 15, total identifiers: 725
Saved page 16, total identifiers: 775
Saved page 17, total identifiers: 825
Saved page 18, total identifiers: 875
Saved page 19, total identifiers: 925
Saved page 20, total identifiers: 975
Saved page 21, total identifiers: 1025
Saved page 22, total identifiers: 1075
Saved page 23, total identifiers: 1125
Saved page 24, total identifiers: 1175
Saved page 25, total identifiers: 1225
Saved page 26