In [113]:
import csv
import json
import os
import pickle
import time
import xml.etree.ElementTree as ET
from urllib.parse import urlsplit

import requests

In [114]:
# --- Remote endpoint ---------------------------------------------------------
BASE_URL = "https://data.rijksmuseum.nl/oai"
METADATA_PREFIX = "edm"
# Seconds slept after each record fetch in the harvesting loop.
REQUEST_SLEEP = 1.0

# --- Local inputs / outputs --------------------------------------------------
# JSON file holding the list of record identifiers to harvest.
IDENT_FILE = "rijksmuseum_identifiers.json"
# Pickle cache of harvested records (used to resume) and its CSV export.
PICKLE_FILE = "rijksmuseum_metadata.pkl"
CSV_FILE = "rijksmuseum_metadata.csv"
# Directory that download_image() writes image files into.
IMAGE_DIR = "rijksmuseum_images"

# Create the image directory up front; a no-op when it already exists.
os.makedirs(IMAGE_DIR, exist_ok=True)

In [115]:
# Prefix -> namespace URI map handed to ElementTree's find()/findall().
# The "nsN" prefixes match the auto-generated prefixes ElementTree prints when it
# re-serialises a response; ns4, ns6 and ns9 are aliases for the same URIs as
# edm, dcterms and skos respectively.
ns = {
    # OAI-PMH envelope and RDF
    "oai": "http://www.openarchives.org/OAI/2.0/",
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    # Conventional vocabulary prefixes
    "dc": "http://purl.org/dc/elements/1.1/",
    "dcterms": "http://purl.org/dc/terms/",
    "edm": "http://www.europeana.eu/schemas/edm/",
    "skos": "http://www.w3.org/2004/02/skos/core#",
    # ElementTree-style numbered prefixes
    "ns4": "http://www.europeana.eu/schemas/edm/",
    "ns6": "http://purl.org/dc/terms/",
    "ns7": "http://id.loc.gov/vocabulary/relators/",
    "ns8": "http://rdfs.org/sioc/services#",
    "ns9": "http://www.w3.org/2004/02/skos/core#",
    # Needed so predicates such as [@xml:lang='nl'] resolve
    "xml": "http://www.w3.org/XML/1998/namespace",
}

In [116]:
def download_image(url, identifier=None):
    """Download ``url`` into ``IMAGE_DIR`` and return the local file path.

    Args:
        url: Image URL. A falsy value short-circuits to ``None``.
        identifier: Optional record identifier used as the file stem
            (``/`` replaced by ``_``). Without it the last URL path segment
            is used as the file name.

    Returns:
        Path of the downloaded (or already present) file, or ``None`` when
        there is no URL, the server did not answer 200, or the download failed.
        Failures are printed, not raised (best-effort download).
    """
    if not url:
        return None

    # Look only at the URL *path*: taking everything after the last "." of the
    # whole URL yields garbage such as "com/abc=s0" or "jpg?download=1" when the
    # last segment has no extension or the URL carries a query string.
    basename = urlsplit(url).path.rstrip("/").rsplit("/", 1)[-1]
    # Fallback extension is an assumption (images presumed JPEG) — adjust if needed.
    ext = os.path.splitext(basename)[1].lstrip(".") or "jpg"

    if identifier:
        # NOTE: only "/" is replaced, to stay compatible with files saved by
        # earlier runs; other characters of the identifier are kept as-is.
        filename = f"{identifier.replace('/', '_')}.{ext}"
    else:
        filename = basename or f"image.{ext}"

    filepath = os.path.join(IMAGE_DIR, filename)
    if os.path.exists(filepath):
        return filepath

    # Stream into a sidecar file and rename on success, so an interrupted
    # download never leaves a truncated file that the exists-check above would
    # later mistake for a complete image.
    partial_path = filepath + ".part"
    try:
        # The context manager releases the streamed connection on every path.
        with requests.get(url, stream=True, timeout=30) as response:
            if response.status_code != 200:
                print(f"Failed to download {url}: HTTP {response.status_code}")
                return None
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(8192):
                    f.write(chunk)
        os.replace(partial_path, filepath)
        return filepath
    except Exception as e:
        # Deliberately broad: a single bad image must not abort a long harvest.
        print(f"Failed to download {url}: {e}")
        return None
    finally:
        # After a successful os.replace the sidecar no longer exists.
        if os.path.exists(partial_path):
            try:
                os.remove(partial_path)
            except OSError:
                # Leftover ".part" files are harmless; never mask the real outcome.
                pass

In [117]:
def fetch_metadata(identifier):
    """Fetch one record via OAI-PMH ``GetRecord`` and flatten its EDM metadata.

    Args:
        identifier: Record identifier sent as the ``identifier`` request argument.

    Returns:
        A dict with the keys ``identifier``, ``title_nl``,
        ``alternative_titles_nl``, ``description_nl``, ``artist``, ``dating``,
        ``dimensions``, ``object_type``, ``materials_techniques``, ``subjects``,
        ``museum_department``, ``rights``, ``image_url`` and ``image_file``;
        or ``None`` when the response contains no ``ProvidedCHO`` element.

    Raises:
        requests.RequestException: On network failure or a non-2xx HTTP status.
        xml.etree.ElementTree.ParseError: If the response body is not well-formed XML.

    Side effect: calls ``download_image`` for the record's image URL.
    """
    RDF_ABOUT = "{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about"
    RDF_RESOURCE = "{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource"
    XML_LANG = "{http://www.w3.org/XML/1998/namespace}lang"

    # Let requests build the query string so the identifier (a URI containing
    # ":" and "/") is percent-encoded instead of being pasted in verbatim.
    response = requests.get(
        BASE_URL,
        params={
            "verb": "GetRecord",
            "metadataPrefix": METADATA_PREFIX,
            "identifier": identifier,
        },
        timeout=30,
    )
    # Surface HTTP errors as such rather than as an XML parse error further down.
    response.raise_for_status()
    root = ET.fromstring(response.content)

    metadata = root.find(".//ns4:ProvidedCHO", ns)
    if metadata is None:
        return None

    # Index every element carrying rdf:about once (first occurrence wins, as
    # find() would return). This replaces a full-tree XPath scan per lookup and
    # avoids building XPath predicates from URIs that may contain a quote.
    nodes_by_about = {}
    for node in root.iter():
        about = node.get(RDF_ABOUT)
        if about:
            nodes_by_about.setdefault(about, node)

    def get_text(tag):
        "Get text from a dc/ns6 tag, prefer nl then en, else the first element."
        elems = metadata.findall(tag, ns)
        for wanted in ("nl", "en"):
            for e in elems:
                if e.attrib.get(XML_LANG) == wanted:
                    return e.text
        return elems[0].text if elems else None

    def get_all_resources(tag):
        "Return the rdf:resource value of every `tag` child (None where absent)."
        return [e.attrib.get(RDF_RESOURCE) for e in metadata.findall(tag, ns)]

    def resolve_label(url_val):
        "Map a resource URL to its prefLabel (nl, then en); fall back to the URL itself."
        if not url_val:
            return None
        node = nodes_by_about.get(url_val)
        if node is None:
            return url_val
        for lang in ("nl", "en"):
            label = node.find(f"ns9:prefLabel[@xml:lang='{lang}']", ns)
            # An empty label would put None into a ", ".join(...) below.
            if label is not None and label.text:
                return label.text
        return url_val

    def resolved_labels(tag):
        "Labels for every `tag` child that actually references a resource."
        return [resolve_label(u) for u in get_all_resources(tag) if u]

    # Artist(s): creators given as plain text carry no rdf:resource; use that
    # text instead of letting None reach str.join (which raises TypeError).
    artists = resolved_labels("dc:creator")
    artist_str = ", ".join(artists) if artists else (get_text("dc:creator") or "")

    # Object type (first dc:type only)
    obj_type = None
    type_elem = metadata.find("dc:type", ns)
    if type_elem is not None:
        obj_type = resolve_label(type_elem.attrib.get(RDF_RESOURCE))

    # Materials / techniques: resolved resources, else the literal text
    materials = resolved_labels("dc:format")
    materials_str = ", ".join(materials) if materials else get_text("dc:format")

    # Subjects
    subjects_str = ", ".join(resolved_labels("dc:subject"))

    # Rights: literal text wins over a referenced resource
    rights_list = []
    for rights_node in metadata.findall("dc:rights", ns):
        if rights_node.text and rights_node.text.strip():
            rights_list.append(rights_node.text.strip())
        elif rights_node.attrib.get(RDF_RESOURCE):
            rights_list.append(resolve_label(rights_node.attrib.get(RDF_RESOURCE)))
    rights_str = ", ".join(rights_list) if rights_list else None

    # Museum department / collection
    departments = []
    for part in metadata.findall("dcterms:isPartOf", ns):
        label = resolve_label(part.attrib.get(RDF_RESOURCE))
        if label:
            departments.append(label)
    museum_department = ", ".join(departments)

    # Image URL: edm:object first; fall back to the WebResource nested under
    # isShownBy, including when edm:object exists but has no rdf:resource.
    image_url = None
    image_elem = root.find(".//ns4:object", ns)
    if image_elem is not None:
        image_url = image_elem.attrib.get(RDF_RESOURCE)
    if not image_url:
        web_res = root.find(".//ns4:isShownBy/ns4:WebResource", ns)
        image_url = web_res.attrib.get(RDF_ABOUT) if web_res is not None else None

    record_identifier = metadata.attrib.get(RDF_ABOUT)

    # Download image (returns None when there is no URL or the download fails)
    image_file = download_image(image_url, record_identifier)

    # Key order is significant: the CSV export takes its columns from it.
    return {
        "identifier": record_identifier,
        "title_nl": get_text("dc:title"),
        "alternative_titles_nl": get_text("ns6:alternative"),
        "description_nl": get_text("dc:description"),
        "artist": artist_str,
        "dating": get_text("ns6:created"),
        "dimensions": get_text("ns6:extent"),
        "object_type": obj_type,
        "materials_techniques": materials_str,
        "subjects": subjects_str,
        "museum_department": museum_department,
        "rights": rights_str,
        "image_url": image_url,
        "image_file": image_file
    }

In [None]:
# Harvest every identifier listed in IDENT_FILE, resuming from the pickle cache
# when one exists, and checkpoint the results to pickle + CSV as we go.
with open(IDENT_FILE, "r", encoding="utf-8") as f:
    all_identifiers = json.load(f)

if os.path.exists(PICKLE_FILE):
    # SECURITY NOTE: unpickling executes arbitrary code embedded in the file.
    # Only load a cache that this notebook wrote itself.
    with open(PICKLE_FILE, "rb") as f:
        all_records = pickle.load(f)
    print(f"Loaded {len(all_records)} records from pickle.")
else:
    all_records = []

processed_ids = {r["identifier"] for r in all_records}


def save_progress(records):
    """Checkpoint ``records`` to PICKLE_FILE and CSV_FILE.

    Each file is written to a temporary sibling and then renamed over the
    target, so an interrupt in the middle of a write cannot corrupt the
    previous checkpoint. Does nothing when there are no records yet (there
    would be no first record to take the CSV header from).
    """
    if not records:
        return

    pickle_tmp = PICKLE_FILE + ".tmp"
    with open(pickle_tmp, "wb") as f:
        pickle.dump(records, f)
    os.replace(pickle_tmp, PICKLE_FILE)

    csv_tmp = CSV_FILE + ".tmp"
    with open(csv_tmp, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=list(records[0].keys()))
        writer.writeheader()
        writer.writerows(records)
    os.replace(csv_tmp, CSV_FILE)


total = len(all_identifiers)
failed_ids = []

try:
    for i, identifier in enumerate(all_identifiers):
        if identifier in processed_ids:
            continue

        try:
            record = fetch_metadata(identifier)
        except (requests.RequestException, ET.ParseError) as exc:
            # One unreachable or malformed record must not abort the whole
            # harvest; it is not marked as processed, so a later run retries it.
            print(f"Failed to fetch {identifier}: {exc}")
            failed_ids.append(identifier)
            record = None

        if record:
            all_records.append(record)
            processed_ids.add(record["identifier"])

        if (i + 1) % 50 == 0 or (i + 1) == total:
            print(f"Processed {i+1}/{total} records.")
            save_progress(all_records)

        time.sleep(REQUEST_SLEEP)

except KeyboardInterrupt:
    print("\nInterrupted.")
finally:
    # Runs on normal completion, on interrupt and on unexpected errors, so the
    # records fetched since the last 50-record checkpoint are never lost.
    save_progress(all_records)
    print("Progress saved.")

if failed_ids:
    print(f"{len(failed_ids)} records failed and will be retried on the next run.")
print(f"Done! Total records: {len(all_records)}")

Processed 50/31216 records.
Processed 100/31216 records.


In [None]:
import requests
import xml.etree.ElementTree as ET

# Debug aid: dump the raw OAI-PMH response of a single record to inspect its XML.
record_id = "https://id.rijksmuseum.nl/200107928"
url = f"{BASE_URL}?verb=GetRecord&metadataPrefix=edm&identifier={record_id}"
# requests has no default timeout; without one a stalled connection hangs the kernel.
r = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as XML.
r.raise_for_status()
root = ET.fromstring(r.content)

print(ET.tostring(root, encoding="unicode"))

<ns0:OAI-PMH xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:ns0="http://www.openarchives.org/OAI/2.0/" xmlns:ns10="http://www.w3.org/2002/07/owl#" xmlns:ns11="http://rdvocab.info/ElementsGr2/" xmlns:ns12="http://www.w3.org/2003/01/geo/wgs84_pos#" xmlns:ns3="http://www.openarchives.org/ore/terms/" xmlns:ns4="http://www.europeana.eu/schemas/edm/" xmlns:ns6="http://purl.org/dc/terms/" xmlns:ns7="http://id.loc.gov/vocabulary/relators/" xmlns:ns8="http://rdfs.org/sioc/services#" xmlns:ns9="http://www.w3.org/2004/02/skos/core#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd">
  <ns0:responseDate>2026-01-13T06:57:57Z</ns0:responseDate>
  <ns0:request metadataPrefix="edm" identifier="https://id.rijksmuseum.nl/200107928">https://data.rijksmuseum.nl/oai</ns0:request>
  <ns0:GetRecord>
    <ns0:record>
      <ns0:header>
       