In [1]:
import polars as pl
import requests
from tqdm.notebook import tqdm
from urllib.parse import urlparse

sparql_url = "https://query.wikidata.org/sparql"

In [2]:
sparql_query = """
SELECT ?item ?itemLabel ?coord ?depicts ?depictsLabel ?dod ?image ?countryLabel WHERE {
  # Q659396 is equestrian statue
  {
    SELECT DISTINCT ?item
    WHERE {
      { ?item wdt:P136 wd:Q659396 }
      UNION
      { ?item wdt:P31  wd:Q659396 }
    }
  }

  # 2) Join the heavy bits after narrowing to one page of items
  ?item wdt:P625 ?coord .
  ?item wdt:P180 ?depicts .
  FILTER NOT EXISTS { ?depicts wdt:P31 wd:Q726 }
  ?depicts wdt:P570 ?dod .
  OPTIONAL { ?item wdt:P17 ?country }
  OPTIONAL { ?item wdt:P18 ?image }
  
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en".
  }
}
"""

In [3]:
headers = {"Accept": "application/sparql-results+json"}
response = requests.get(sparql_url, params={"query": sparql_query}, headers=headers)
rows = response.json()["results"]["bindings"]
rows[0]

{'item': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q2183129'},
 'depicts': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q129987'},
 'dod': {'datatype': 'http://www.w3.org/2001/XMLSchema#dateTime',
  'type': 'literal',
  'value': '1702-03-19T00:00:00Z'},
 'coord': {'datatype': 'http://www.opengis.net/ont/geosparql#wktLiteral',
  'type': 'literal',
  'value': 'Point(4.776111 51.589722)'},
 'image': {'type': 'uri',
  'value': 'http://commons.wikimedia.org/wiki/Special:FilePath/Monument%20Willem%20III%2C%20Breda.jpg'},
 'itemLabel': {'xml:lang': 'en',
  'type': 'literal',
  'value': 'Equestrian statue of William III'},
 'depictsLabel': {'xml:lang': 'en',
  'type': 'literal',
  'value': 'William III of England'},
 'countryLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Netherlands'}}

In [4]:
def resolve_commons_filepath(filepath_url: str, timeout: float = 10.0) -> str:
    s = requests.Session()
    r = s.head(
        filepath_url,
        allow_redirects=True,
        timeout=timeout,
        headers={
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        },
    )
    r.raise_for_status()
    final_url = r.url
    if "upload.wikimedia.org" not in urlparse(final_url).netloc:
        r = s.get(filepath_url, allow_redirects=True, stream=True, timeout=timeout)
        r.raise_for_status()
        final_url = r.url
        r.close()
    if "upload.wikimedia.org" not in urlparse(final_url).netloc:
        raise RuntimeError("Could not resolve to an upload.wikimedia.org URL")
    return final_url


In [5]:
df = pl.DataFrame(rows)
fixed = df.select(
    qid=pl.col("item").struct["value"].str.split("/").list[-1],
    label=pl.col("itemLabel").struct["value"],
    image=pl.col("image")
    .struct["value"]
    .map_elements(resolve_commons_filepath, return_dtype=pl.String),
    depicted_qid=pl.col("depicts").struct["value"].str.split("/").list[-1],
    depicted=pl.col("depictsLabel").struct["value"],
    dod=pl.col("dod").struct["value"],
    country=pl.col("countryLabel").struct["value"],
    lat=pl.col("coord").struct["value"].str.extract(r" ([-\d\.]+)\)"),
    lon=pl.col("coord").struct["value"].str.extract(r"\(([-\d\.]+) "),
).filter(pl.col("lat").is_not_null(), pl.col("lon").is_not_null())
fixed

qid,label,image,depicted_qid,depicted,dod,country,lat,lon
str,str,str,str,str,str,str,str,str
"""Q2183129""","""Equestrian statue of William I…","""https://upload.wikimedia.org/w…","""Q129987""","""William III of England""","""1702-03-19T00:00:00Z""","""Netherlands""","""51.589722""","""4.776111"""
"""Q2437806""","""Monument to Giuseppe Garibaldi""","""https://upload.wikimedia.org/w…","""Q539""","""Giuseppe Garibaldi""","""1882-06-02T00:00:00Z""","""Argentina""","""-34.581313917""","""-58.421086888"""
"""Q2484700""","""monument of Leopold II in Oste…","""https://upload.wikimedia.org/w…","""Q12967""","""Leopold II of Belgium""","""1909-12-17T00:00:00Z""","""Belgium""","""51.22727""","""2.90434"""
"""Q2484700""","""monument of Leopold II in Oste…","""https://upload.wikimedia.org/w…","""Q12967""","""Leopold II of Belgium""","""1909-12-17T00:00:00Z""","""Belgium""","""51.22727""","""2.90434"""
"""Q2511913""","""equestrian statue of Marshal M…","""https://upload.wikimedia.org/w…","""Q152306""","""Carl Gustaf Emil Mannerheim""","""1951-01-27T00:00:00Z""","""Finland""","""60.171555""","""24.936398"""
…,…,…,…,…,…,…,…,…
"""Q125809468""","""Monument to Umberto I di Savoi…","""https://upload.wikimedia.org/w…","""Q153688""","""Umberto I of Italy""","""1900-07-29T00:00:00Z""","""Italy""","""37.513299966""","""15.082806711"""
"""Q126963135""","""Monument to Ferdinando of Savo…","""https://upload.wikimedia.org/w…","""Q459441""","""Prince Ferdinand, Duke of Geno…","""1855-02-10T00:00:00Z""","""Italy""","""45.068887411""","""7.677089794"""
"""Q132191679""","""equestrian statue of Viscount …","""https://upload.wikimedia.org/w…","""Q335994""","""Hugh Gough, 1st Viscount Gough""","""1869-03-02T00:00:00Z""","""United Kingdom""","""55.5262""","""-1.9117"""
"""Q133261457""","""Q133261457""","""https://upload.wikimedia.org/w…","""Q1060796""","""John IV of Portugal""","""1656-11-06T00:00:00Z""","""Portugal""","""38.782581""","""-7.421233"""


In [6]:
def get_location_photon(loc: dict[str, str]) -> dict:
    url = "https://photon.komoot.io/reverse"
    params = {
        "lat": loc["lat"],
        "lon": loc["lon"],
    }
    response = requests.get(
        url, params=params, timeout=5, headers={"accept-language": "en-US,en;q=0.9"}
    )
    for p in response.json()["features"]:
        if p.get("properties", {}).get("country"):
            return {
                k: v for k, v in p["properties"].items() if k in ["city", "country"]
            }
    raise Exception(f"No country found for {loc}")

In [7]:
d = {}
for row in tqdm(fixed.iter_rows(named=True), total=len(fixed)):
    d[(row["lat"], row["lon"])] = get_location_photon(row)

  0%|          | 0/538 [00:00<?, ?it/s]

In [8]:
fixed = (
    fixed.unique("qid")
    .with_columns(
        location=pl.struct(["lat", "lon"])
        .map_elements(
            lambda row: d.get((row["lat"], row["lon"])),
            return_dtype=pl.Struct(
                [pl.Field("city", pl.Utf8), pl.Field("country", pl.Utf8)]
            ),
        )
        .name.prefix_fields("photon_")
    )
    .unnest("location")
)
fixed.head()

qid,label,image,depicted_qid,depicted,dod,country,lat,lon,photon_country,photon_city
str,str,str,str,str,str,str,str,str,str,str
"""Q125802369""","""Statue of Imre Thököly (Kežmar…","""https://upload.wikimedia.org/w…","""Q357151""","""Imre Thököly""","""1705-09-13T00:00:00Z""","""Slovakia""","""49.13923""","""20.43316""","""Slovakia""","""Kežmarok"""
"""Q46999400""","""Robert E. Lee Monument""","""https://upload.wikimedia.org/w…","""Q165557""","""Robert E. Lee""","""1870-10-12T00:00:00Z""","""United States""","""37.553844""","""-77.460109""","""United States""","""Richmond"""
"""Q3968677""","""El Caballito""","""https://upload.wikimedia.org/w…","""Q183226""","""Charles IV of Spain""","""1819-01-20T00:00:00Z""","""Mexico""","""19.43605""","""-99.13948""","""Mexico""",
"""Q7631200""","""Subhas Chandra Bose statue""","""https://upload.wikimedia.org/w…","""Q2153""","""Subhas Chandra Bose""","""1945-08-18T00:00:00Z""","""India""","""22.60179""","""88.37363""","""India""","""Kolkata"""
"""Q128059917""","""Q128059917""","""https://upload.wikimedia.org/w…","""Q48438""","""Saint George""","""0303-04-24T00:00:00Z""","""Czech Republic""","""50.8632375""","""14.4819931""","""Czechia""","""Chřibská"""
