* #307

In [3]:
import time

from datetime import datetime

now = datetime.now()
timestamp = now.timestamp()

start_time = time.time()
print("Start:", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

Start: 2025-11-09 20:48:33


In [183]:
import requests
import pandas as pd
from urllib.parse import quote

# --- Free license mapping for Wikimedia Commons ---
FREE_LICENSES = {
    "CC by": "cc-by-4.0",
    "CC BY": "cc-by-4.0",
    "CC BY-SA": "cc-by-sa-4.0",
    "CC by-sa": "cc-by-sa-4.0",
    "CC0": "cc-zero",
    "Public Domain": "cc-zero",
    "CC pdm" : "cc-zero",
}


from IPython.display import HTML

import pandas as pd
from IPython.display import display, HTML
import ipywidgets as widgets

def show_dimu_results(df, per_page=30, thumb_width=200):
    """Interactive viewer for DigitaltMuseum results with pagination."""
    df = df.copy()
    df["Thumb"] = df["image_url"].apply(
        lambda x: f'<img src="{x}" width="{thumb_width}">' if x else ""
    )
    df["Museum"] = df["museum_link"].apply(
        lambda x: f'<a href="{x}" target="_blank">üîó DigitaltMuseum</a>'
    )
    df["Commons"] = df["commons_upload"].apply(
        lambda x: f'<a href="{x}" target="_blank">‚¨ÜÔ∏è Upload to Commons</a>'
    )

    total_pages = (len(df) - 1) // per_page + 1

    # Output area to update when page changes
    output = widgets.Output()

    def render_page(page):
        """Render a specific page."""
        start, end = (page - 1) * per_page, page * per_page
        df_page = df.iloc[start:end]
        html = df_page[
            ["title", "Thumb", "photographer", "license", "Museum", "Commons"]
        ].to_html(escape=False, index=False)
        with output:
            output.clear_output(wait=True)
            display(HTML(
                f"<b>Page {page}/{total_pages} "
                f"({len(df)} total items)</b><br>"
                f'<div style="max-height:600px;overflow-y:auto">{html}</div>'
            ))

    # Buttons for navigation
    prev_button = widgets.Button(description="‚¨ÖÔ∏è Previous", disabled=True)
    next_button = widgets.Button(description="Next ‚û°Ô∏è")

    page_state = {"page": 1}

    def on_prev_click(b):
        if page_state["page"] > 1:
            page_state["page"] -= 1
            render_page(page_state["page"])
        update_buttons()

    def on_next_click(b):
        if page_state["page"] < total_pages:
            page_state["page"] += 1
            render_page(page_state["page"])
        update_buttons()

    def update_buttons():
        prev_button.disabled = page_state["page"] <= 1
        next_button.disabled = page_state["page"] >= total_pages

    prev_button.on_click(on_prev_click)
    next_button.on_click(on_next_click)

    # Initial render
    render_page(1)
    display(widgets.HBox([prev_button, next_button]))
    display(output)

# --- Helper: construct proper fdms01 image URL ---
def dimu_image_url(media_id, dimension="800x800", host="fdms01", filename=None):
    """Return a working HTTPS image URL for DigitaltMuseum media."""
    url = f"https://{host}.dimu.org/image/{media_id}?dimension={dimension}"

    if filename:
        url += f"&filename={filename}.jpg"
    return url


# --- Main function ---
def search_digitaltmuseum_all(query, chunk=1000, max_results=None):
    """
    Retrieve all paginated DigitaltMuseum results matching `query`.
    Returns DataFrame filtered to free-license images (CC BY, CC BY-SA, CC0, PD).
    Each row includes a Commons-ready upload link.
    """
    url = "https://api.dimu.org/api/solr/select"
    start = 0
    total_found = None
    all_rows = []

    while True:
        params = {
            "q": query,
            "fl": (
                "identifier.id,identifier.owner,"
                "artifact.uniqueId,artifact.ingress.title,"
                "artifact.ingress.description,artifact.ingress.license,"
                "artifact.ingress.producer,artifact.ingress.production.fromYear,"
                "artifact.ingress.production.toYear,artifact.ingress.production.place,"
                "artifact.defaultMediaIdentifier"
            ),
            "rows": chunk,
            "start": start,
            "wt": "json",
            "api.key": "demo",
        }

        r = requests.get(url, params=params)
        
        r.raise_for_status()
        data = r.json()["response"]

        if total_found is None:
            total_found = data["numFound"]
            print(f"üîé Found {total_found} hits for '{query}'")

        docs = data.get("docs", [])

        if not docs:
            break

        for d in docs:
            licenses = d.get("artifact.ingress.license", [])
            license_value = licenses[0] if licenses else None
            if license_value not in FREE_LICENSES:
                continue
            
            media_id = d.get("artifact.defaultMediaIdentifier")
            if not media_id:
                continue

            # Build image and Commons data
            #image_url = dimu_image_url(media_id, dimension="800x800", filename=media_id)
            image_url = f"https://dms01.dimu.org/image/{media_id}"

            commons_license = FREE_LICENSES[license_value]
            title = d.get("artifact.ingress.title", "Untitled")
            museum_link = f"https://digitaltmuseum.se/{d.get('artifact.uniqueId')}"

            desc_sv = f"{title}, uppladdat fr√•n DigitaltMuseum"
            desc_en = f"{title}, uploaded from DigitaltMuseum"

            # Commons upload description
            summary = f"""{{{{Information
|description={{{{sv|{desc_sv}}}}}{{{{en|{desc_en}}}}}
|source={museum_link}
|author={d.get('artifact.ingress.producer', 'ok√§nd')}
|permission=
|other versions=
}}}}

== Licensing ==
{{license_value}}

[[Category:Digitalt Museum]]
[[Category:SAT Digitalt Museum]]
"""

            commons_upload_url = (
                "https://commons.wikimedia.org/wiki/Special:Upload?"
                f"wpUploadDescription={quote(summary)}"
                f"&wpLicense={commons_license}"
                f"&wpDestFile={quote(title.replace(' ', '_') + '.jpg')}"
                f"&wpSourceType=url"
                f"&wpUploadFileURL={quote(image_url)}"
            )

            row = {
                "title": title,
                "photographer": d.get("artifact.ingress.producer"),
                "year_from": d.get("artifact.ingress.production.fromYear"),
                "place": d.get("artifact.ingress.production.place"),
                "owner": d.get("identifier.owner"),
                "license": license_value,
                "commons_license": commons_license,
                "image_url": image_url,
                "museum_link": museum_link,
                "commons_upload": commons_upload_url,
            }

            all_rows.append(row)

        start += chunk
        if start >= total_found:
            break
        if max_results and len(all_rows) >= max_results:
            break

    df = pd.DataFrame(all_rows)
    print(f"‚úÖ {len(df)} free-license images collected.")
    return df


In [180]:
from IPython.display import HTML

df["museum_link"] = df["museum_link"].apply(
    lambda x: f'<a href="{x}" target="_blank">üîó DigitaltMuseum</a>'
)


In [197]:
#df = search_digitaltmuseum_all("Ut√∂", chunk=200)
#df = search_digitaltmuseum_all("Ut√∂ kvarn", chunk=200)
#df = search_digitaltmuseum_all("N√•ttar√∂", chunk=200)
#df = search_digitaltmuseum_all("Landsort v√§stra", chunk=200)
#df = search_digitaltmuseum_all("Vy fr√•n lotsutkiken", chunk=200)
#df = search_digitaltmuseum_all("Orn√∂ kyrka", chunk=200)
#df = search_digitaltmuseum_all("Orn√∂ kyrka", chunk=200)
#df = search_digitaltmuseum_all("Arholma kapell", chunk=100)
df = search_digitaltmuseum_all("Tullverkets fd tv√§ttbod.", chunk=100)

show_dimu_results(df)



üîé Found 1 hits for 'Tullverkets fd tv√§ttbod.'
‚úÖ 1 free-license images collected.


HBox(children=(Button(description='‚¨ÖÔ∏è Previous', disabled=True, style=ButtonStyle()), Button(description='Next‚Ä¶

Output()

In [146]:
from IPython.display import HTML

df["museum_link"] = df["museum_link"].apply(
    lambda x: f'<a href="{x}" target="_blank">üîó DigitaltMuseum</a>'
)

HTML(df[["title","license","Thumb", "museum_link"]].to_html(escape=False, index=False))


KeyError: "['Thumb'] not in index"

In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86 entries, 0 to 85
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   title            86 non-null     object 
 1   photographer     85 non-null     object 
 2   year_from        78 non-null     float64
 3   place            72 non-null     object 
 4   owner            86 non-null     object 
 5   license          86 non-null     object 
 6   commons_license  86 non-null     object 
 7   image_url        86 non-null     object 
 8   museum_link      86 non-null     object 
 9   commons_upload   86 non-null     object 
 10  Thumb            86 non-null     object 
dtypes: float64(1), object(10)
memory usage: 7.5+ KB


In [52]:
df["image_url"].head(3).tolist()

['https://fdms01.dimu.org/image/019EGGiCYB6iM?dimension=800x800&filename=019EGGiCYB6iM.jpg',
 'https://fdms01.dimu.org/image/019EE8iWCRf7o?dimension=800x800&filename=019EE8iWCRf7o.jpg',
 'https://fdms01.dimu.org/image/019EE8iWCRf7q?dimension=800x800&filename=019EE8iWCRf7q.jpg']

In [21]:
 # End timer and calculate duration
end_time = time.time()
elapsed_time = end_time - start_time# Bygg audit-lager f√∂r den h√§r etappen

# Print current date and total time
print("Date:", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
minutes, seconds = divmod(elapsed_time, 60)
print("Total time elapsed: {:02.0f} minutes {:05.2f} seconds".format(minutes, seconds))


Date: 2025-11-09 21:07:32
Total time elapsed: 18 minutes 58.53 seconds
