### 203_sanity_check_WD_OSM

* Issue #203
* This notebook 203_sanity_check_WD_OSM.ipynb

In [14]:
import requests
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON

# -------------------------------
# 1. Get reserves from OSM
# -------------------------------
overpass_url = "https://overpass-api.de/api/interpreter"
query = """
[out:json][timeout:180];
rel(54391)->.stockholm_lan;
.stockholm_lan map_to_area -> .area_stockholm;

(
  node["leisure"="nature_reserve"](area.area_stockholm);
  way["leisure"="nature_reserve"](area.area_stockholm);
  rel["leisure"="nature_reserve"](area.area_stockholm);

  node["boundary"="protected_area"]["protect_class"](area.area_stockholm);
  way["boundary"="protected_area"]["protect_class"](area.area_stockholm);
  rel["boundary"="protected_area"]["protect_class"](area.area_stockholm);
);

out body;
>;
out skel qt;
"""

resp = requests.get(overpass_url, params={'data': query})
data = resp.json()

osm_reserves = []
for el in data.get("elements", []):
    if "tags" in el:
        tags = el["tags"]
        osm_reserves.append({
            "osm_id": f"{el['type']}/{el['id']}",
            "name": tags.get("name"),
            "ref:NVRID": tags.get("ref:NVRID"),
            "wikidata": tags.get("wikidata"),
            "website_osm": tags.get("website"),
        })

df_osm = pd.DataFrame(osm_reserves).drop_duplicates(subset=["osm_id"]).reset_index(drop=True)

print(f"Found {len(df_osm)} reserves in OSM Stockholm län")
display(df_osm.head())


# -------------------------------
# 2. Get reserves from Wikidata
# -------------------------------
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setQuery("""
SELECT ?item ?itemLabel ?NVRID ?geojson ?osmRel ?osmWay ?website WHERE {
  ?item wdt:P31 wd:Q179049;            # instance of nature reserve
        wdt:P131* wd:Q104231.          # located in Stockholm County

  OPTIONAL { ?item wdt:P856 ?website. }
  OPTIONAL { ?item wdt:P3896 ?geojson. }
  OPTIONAL { ?item wdt:P402 ?osmRel. }
  OPTIONAL { ?item wdt:P10689 ?osmWay. }
  OPTIONAL { ?item wdt:P1281 ?NVRID. } # NVR ID
  SERVICE wikibase:label { bd:serviceParam wikibase:language "sv,en". }
}
""")
sparql.setReturnFormat(JSON)

results = sparql.query().convert()

wd_reserves = []
for r in results["results"]["bindings"]:
    wd_reserves.append({
        "wikidata": r["item"]["value"].split("/")[-1],
        "label": r.get("itemLabel", {}).get("value"),
        "NVRID": r.get("NVRID", {}).get("value"),
        "geojson": r.get("geojson", {}).get("value"),
        "osmRel": r.get("osmRel", {}).get("value"),
        "osmWay": r.get("osmWay", {}).get("value"),
        "website_wd": r.get("website", {}).get("value")
    })

df_wd = pd.DataFrame(wd_reserves).drop_duplicates(subset=["wikidata"]).reset_index(drop=True)

print(f"Found {len(df_wd)} reserves in Wikidata (Stockholm län)")
display(df_wd.head())


# -------------------------------
# 3. Merge OSM ↔ Wikidata
# -------------------------------
df_merged = df_osm.merge(df_wd, on="wikidata", how="outer", suffixes=("_osm", "_wd"))

# -------------------------------
# 4. Sanity checks
# -------------------------------
# Missing Wikidata in OSM
missing_wd = df_merged[df_merged["wikidata"].isna() | df_merged["wikidata"].eq("")]
print("Reserves missing Wikidata in OSM:", len(missing_wd))

# Missing NVRID
missing_nvrid = df_merged[df_merged["NVRID"].isna() & df_merged["ref:NVRID"].isna()]
print("Reserves missing NVRID:", len(missing_nvrid))

# Website comparison
df_merged["website_match"] = df_merged.apply(
    lambda row: row["website_osm"] == row["website_wd"] if pd.notna(row["website_osm"]) and pd.notna(row["website_wd"]) else None,
    axis=1
)

mismatched_websites = df_merged[df_merged["website_match"] == False]

print("Reserves with mismatched websites:", len(mismatched_websites))

# -------------------------------
# 5. Display merged table
# -------------------------------
display(df_merged.head(30))


Found 393 reserves in OSM Stockholm län


Unnamed: 0,osm_id,name,ref:NVRID,wikidata,website_osm
0,node/6293895037,Dyviklövängars naturreservat,,,
1,way/56660225,Mödomen,,,
2,way/56666596,Fågelgrundet,,,
3,way/103223526,Svalgarns naturreservat,2000166.0,Q30162772,https://www.upplands-bro.se/uppleva-och-gora/n...
4,way/103223545,Talbyskogen,,Q10690208,


Found 374 reserves in Wikidata (Stockholm län)


Unnamed: 0,wikidata,label,NVRID,geojson,osmRel,osmWay,website_wd
0,Q29580026,Häverö prästängs naturreservat,,http://commons.wikimedia.org/data/main/Data:/S...,,103228133.0,https://www.lansstyrelsen.se/stockholm/besoksm...
1,Q30157780,Oxön,,http://commons.wikimedia.org/data/main/Data:/S...,,,
2,Q30157868,Gränskäret,,http://commons.wikimedia.org/data/main/Data:/S...,,103225936.0,
3,Q30158128,Lidö naturreservat,,http://commons.wikimedia.org/data/main/Data:/S...,1463761.0,,https://www.lansstyrelsen.se/stockholm/besoksm...
4,Q30158284,Mornäsan,,http://commons.wikimedia.org/data/main/Data:/S...,,103226788.0,


Reserves missing Wikidata in OSM: 33
Reserves missing NVRID: 208
Reserves with mismatched websites: 17


Unnamed: 0,osm_id,name,ref:NVRID,wikidata,website_osm,label,NVRID,geojson,osmRel,osmWay,website_wd,website_match
0,node/6293895037,Dyviklövängars naturreservat,,,,,,,,,,
1,way/56660225,Mödomen,,,,,,,,,,
2,way/56666596,Fågelgrundet,,,,,,,,,,
3,way/163794719,,,,,,,,,,,
4,way/353421146,Biotopskydd,,,,,,,,,,
5,way/447777489,,,,,,,,,,,
6,way/531344162,Mölnviks biotopskyddsområde,,,,,,,,,,
7,way/531344167,Farstaborgs biotopskyddsområde,,,,,,,,,,
8,way/577604971,Biotopskydd,,,,,,,,,,
9,way/671858677,,,,,,,,,,,


### What is needed to be cleaned 

In [17]:
# -------------------------------
# 6. Flag geometry coverage
# -------------------------------
def geometry_status(row):
    if pd.notna(row["geojson"]) or pd.notna(row["osmRel"]) or pd.notna(row["osmWay"]):
        return "✅ Wikidata links to geometry"
    elif str(row["osm_id"]).startswith("way/") or str(row["osm_id"]).startswith("relation/"):
        return "✅ Geometry in OSM"
    else:
        return "⚠️ Missing geometry"

df_merged["geometry_status"] = df_merged.apply(geometry_status, axis=1)

# -------------------------------
# 7. Create TODO list
# -------------------------------
todos = []

for _, row in df_merged.iterrows():
    issues = []
    if pd.isna(row["wikidata"]):
        issues.append("Add Wikidata")
    if pd.isna(row["NVRID"]) and pd.isna(row["ref:NVRID"]):
        issues.append("Add NVRID")
    if row["website_match"] is False:
        issues.append("Fix website mismatch")
    if row["geometry_status"].startswith("⚠️"):
        issues.append("Add geometry")

    if issues:
        todos.append({
            "name": row.get("name") or row.get("label"),
            "osm_link": f"https://www.openstreetmap.org/{row['osm_id']}" if pd.notna(row["osm_id"]) else None,
            "wikidata_link": f"https://www.wikidata.org/wiki/{row['wikidata']}" if pd.notna(row["wikidata"]) else None,
            "issues": ", ".join(issues)
        })

df_todo = pd.DataFrame(todos)

display(df_todo.head(30))
print(f"Total TODOs: {len(df_todo)}")


Unnamed: 0,name,osm_link,wikidata_link,issues
0,Dyviklövängars naturreservat,https://www.openstreetmap.org/node/6293895037,,"Add Wikidata, Add NVRID, Add geometry"
1,Mödomen,https://www.openstreetmap.org/way/56660225,,"Add Wikidata, Add NVRID"
2,Fågelgrundet,https://www.openstreetmap.org/way/56666596,,"Add Wikidata, Add NVRID"
3,,https://www.openstreetmap.org/way/163794719,,"Add Wikidata, Add NVRID"
4,Biotopskydd,https://www.openstreetmap.org/way/353421146,,"Add Wikidata, Add NVRID"
5,,https://www.openstreetmap.org/way/447777489,,"Add Wikidata, Add NVRID"
6,Mölnviks biotopskyddsområde,https://www.openstreetmap.org/way/531344162,,"Add Wikidata, Add NVRID"
7,Farstaborgs biotopskyddsområde,https://www.openstreetmap.org/way/531344167,,"Add Wikidata, Add NVRID"
8,Biotopskydd,https://www.openstreetmap.org/way/577604971,,"Add Wikidata, Add NVRID"
9,,https://www.openstreetmap.org/way/671858677,,"Add Wikidata, Add NVRID"


Total TODOs: 227


In [18]:
# Convert TODO DataFrame into Markdown table

def df_to_markdown(df):
    lines = []
    # Header
    lines.append("| Name | OSM Link | Wikidata Link | Issues |")
    lines.append("|------|----------|---------------|--------|")
    
    for _, row in df.iterrows():
        name = row["name"] or ""
        
        # OSM link
        if pd.notna(row["osm_link"]):
            osm_link = f"[OSM]({row['osm_link']})"
        else:
            osm_link = "*(missing)*"
        
        # Wikidata link
        if pd.notna(row["wikidata_link"]):
            qid = row["wikidata_link"].split("/")[-1]
            wd_link = f"[{qid}]({row['wikidata_link']})"
        else:
            wd_link = "*(missing)*"
        
        issues = row["issues"] or ""
        
        lines.append(f"| {name} | {osm_link} | {wd_link} | {issues} |")
    
    return "\n".join(lines)

markdown_table = df_to_markdown(df_todo)
print(markdown_table)


| Name | OSM Link | Wikidata Link | Issues |
|------|----------|---------------|--------|
| Dyviklövängars naturreservat | [OSM](https://www.openstreetmap.org/node/6293895037) | *(missing)* | Add Wikidata, Add NVRID, Add geometry |
| Mödomen | [OSM](https://www.openstreetmap.org/way/56660225) | *(missing)* | Add Wikidata, Add NVRID |
| Fågelgrundet | [OSM](https://www.openstreetmap.org/way/56666596) | *(missing)* | Add Wikidata, Add NVRID |
| nan | [OSM](https://www.openstreetmap.org/way/163794719) | *(missing)* | Add Wikidata, Add NVRID |
| Biotopskydd | [OSM](https://www.openstreetmap.org/way/353421146) | *(missing)* | Add Wikidata, Add NVRID |
| nan | [OSM](https://www.openstreetmap.org/way/447777489) | *(missing)* | Add Wikidata, Add NVRID |
| Mölnviks biotopskyddsområde | [OSM](https://www.openstreetmap.org/way/531344162) | *(missing)* | Add Wikidata, Add NVRID |
| Farstaborgs biotopskyddsområde | [OSM](https://www.openstreetmap.org/way/531344167) | *(missing)* | Add Wikidata, Add

In [23]:
!pip install rapidfuzz



In [27]:
import requests
import xml.etree.ElementTree as ET
import geopandas as gpd
import os

feed_url = "https://geodata.naturvardsverket.se/atom/inspire/ps/SE_ProtectedSitesN2K_serviceFeed.xml"

resp = requests.get(feed_url)
root = ET.fromstring(resp.content)

ns = {"atom": "http://www.w3.org/2005/Atom"}
entries = root.findall("atom:entry", ns)

download_links = []
for e in entries:
    link = e.find("atom:link", ns).attrib.get("href")
    title = e.find("atom:title", ns).text
    download_links.append((title, link))

print("Found dataset links:")
for title, link in download_links:
    print(f"- {title}: {link}")

# Pick one link manually for now
title, link = download_links[0]
print("Downloading:", title, link)

resp = requests.get(link)

# Save to file with proper extension
fname = os.path.basename(link.split("?")[0])
with open(fname, "wb") as f:
    f.write(resp.content)

print("Saved as:", fname)

# Try reading directly
try:
    gdf = gpd.read_file(fname)
    print("Loaded directly into GeoDataFrame")
except Exception as e:
    print("Direct load failed:", e)


Found dataset links:
- Nedladdningstjänst för data som ingår i INSPIRE bilaga 1, Natura 2000, Art- och habitatdirektivet (SCI): https://www.geodata.se/geodataportalen/srv/eng/csw-inspire?request=GetRecordById&service=CSW&version=2.0.2&elementSetName=full&id=945e918f-8426-4155-8fd6-3f780a85dd8f&outputSchema=csw:IsoRecord
- Nedladdningstjänst för data som ingår i INSPIRE bilaga 1, Natura 2000, Fågeldirektivet (SPA): https://www.geodata.se/geodataportalen/srv/eng/csw-inspire?request=GetRecordById&service=CSW&version=2.0.2&elementSetName=full&id=a80bf3d7-e70c-42d1-9b8d-8148e53e011d&outputSchema=csw:IsoRecord
Downloading: Nedladdningstjänst för data som ingår i INSPIRE bilaga 1, Natura 2000, Art- och habitatdirektivet (SCI) https://www.geodata.se/geodataportalen/srv/eng/csw-inspire?request=GetRecordById&service=CSW&version=2.0.2&elementSetName=full&id=945e918f-8426-4155-8fd6-3f780a85dd8f&outputSchema=csw:IsoRecord
Saved as: csw-inspire
Direct load failed: index 0 is out of bounds for axis 0

In [24]:
from rapidfuzz import process, fuzz

def normalize_name(name):
    if not isinstance(name, str):
        return ""
    name = name.lower()
    name = name.replace("naturreservat", "").replace("nr", "")
    name = name.strip()
    return name

# Normalize names
df_osm["name_norm"] = df_osm["name"].apply(normalize_name)
df_wd["label_norm"] = df_wd["label"].apply(normalize_name)
df_sn["name_norm"] = df_sn["NAMN"].apply(normalize_name)

# Try fuzzy match OSM → Wikidata
matches = []
for idx, row in df_osm.iterrows():
    name = row["name_norm"]
    match, score, _ = process.extractOne(name, df_wd["label_norm"], scorer=fuzz.ratio)
    if score > 80:  # threshold
        wd_row = df_wd[df_wd["label_norm"] == match].iloc[0]
        matches.append({
            "osm_id": row["osm_id"],
            "osm_name": row["name"],
            "wikidata": wd_row["wikidata"],
            "wd_label": wd_row["label"],
            "score": score
        })

df_matches = pd.DataFrame(matches)


NameError: name 'df_sn' is not defined

In [28]:
import requests
import xml.etree.ElementTree as ET
import geopandas as gpd
import os

def get_atom_entries(feed_url):
    """Parse an ATOM feed and return (title, link) tuples."""
    resp = requests.get(feed_url)
    root = ET.fromstring(resp.content)
    ns = {"atom": "http://www.w3.org/2005/Atom"}
    entries = root.findall("atom:entry", ns)
    return [(e.find("atom:title", ns).text, e.find("atom:link", ns).attrib.get("href")) for e in entries]

def get_metadata_links(meta_url):
    """Parse CSW metadata record and extract dataset URLs."""
    resp = requests.get(meta_url)
    root = ET.fromstring(resp.content)
    urls = []
    for elem in root.iter():
        if elem.text and "http" in elem.text:
            urls.append(elem.text.strip())
    return urls

def download_dataset(url, outdir="data"):
    """Download dataset file and return path."""
    os.makedirs(outdir, exist_ok=True)
    fname = os.path.join(outdir, os.path.basename(url.split("?")[0]))
    r = requests.get(url)
    with open(fname, "wb") as f:
        f.write(r.content)
    return fname

# --- Step 1: ATOM feed for ProtectedSites
feed_url = "https://geodata.naturvardsverket.se/atom/inspire/ps/SE_ProtectedSitesN2K_serviceFeed.xml"
entries = get_atom_entries(feed_url)

print("ATOM entries found:")
for title, link in entries:
    print("-", title, link)

# --- Step 2: Pick one entry and follow to metadata
title, meta_url = entries[0]
print("\nFollowing metadata link:", meta_url)
dataset_links = get_metadata_links(meta_url)

print("Dataset links found in metadata:")
for dl in dataset_links:
    print("-", dl)

# --- Step 3: Pick first dataset link and download
if dataset_links:
    dataset_url = [u for u in dataset_links if u.endswith((".zip", ".gml", ".gpkg"))][0]
    fname = download_dataset(dataset_url)
    print("\nDownloaded dataset:", fname)

    # --- Step 4: Try reading with GeoPandas
    try:
        gdf = gpd.read_file(fname)
        print("Loaded dataset into GeoDataFrame with", len(gdf), "features")
        print(gdf.head())
    except Exception as e:
        print("Could not load with GeoPandas:", e)


ATOM entries found:
- Nedladdningstjänst för data som ingår i INSPIRE bilaga 1, Natura 2000, Art- och habitatdirektivet (SCI) https://www.geodata.se/geodataportalen/srv/eng/csw-inspire?request=GetRecordById&service=CSW&version=2.0.2&elementSetName=full&id=945e918f-8426-4155-8fd6-3f780a85dd8f&outputSchema=csw:IsoRecord
- Nedladdningstjänst för data som ingår i INSPIRE bilaga 1, Natura 2000, Fågeldirektivet (SPA) https://www.geodata.se/geodataportalen/srv/eng/csw-inspire?request=GetRecordById&service=CSW&version=2.0.2&elementSetName=full&id=a80bf3d7-e70c-42d1-9b8d-8148e53e011d&outputSchema=csw:IsoRecord

Following metadata link: https://www.geodata.se/geodataportalen/srv/eng/csw-inspire?request=GetRecordById&service=CSW&version=2.0.2&elementSetName=full&id=945e918f-8426-4155-8fd6-3f780a85dd8f&outputSchema=csw:IsoRecord
Dataset links found in metadata:
- http://www.opengis.net/def/crs/EPSG/0/3006
- Creative commons  CC0 1.0 Universiell

DU HAR TILLSTÅND ATT:
Personen som associerat ett ve

In [29]:
import zipfile, os, geopandas as gpd

zip_path = "data/SCI_Rikstackande.zip"
extract_dir = "data/SCI_Rikstackande"

# Unzip
with zipfile.ZipFile(zip_path, "r") as z:
    z.extractall(extract_dir)

# Find .shp file inside
shp_files = [f for f in os.listdir(extract_dir) if f.endswith(".shp")]
print("Shapefiles found:", shp_files)

# Load the first shapefile
if shp_files:
    shp_path = os.path.join(extract_dir, shp_files[0])
    gdf = gpd.read_file(shp_path)
    print("Loaded", len(gdf), "features")
    print(gdf.head())


Shapefiles found: []


In [30]:
import zipfile

with zipfile.ZipFile("data/SCI_Rikstackande.zip") as z:
    print(z.namelist())


['SCI_alvar_AC_lan/', 'SCI_alvar_BD_lan/', 'SCI_ej_alvar_rikstackande/', 'SCI_alvar_AC_lan/SCI_alvar_AC_lan.cpg', 'SCI_alvar_AC_lan/SCI_alvar_AC_lan.dbf', 'SCI_alvar_AC_lan/SCI_alvar_AC_lan.prj', 'SCI_alvar_AC_lan/SCI_alvar_AC_lan.shp', 'SCI_alvar_AC_lan/SCI_alvar_AC_lan.shx', 'SCI_alvar_BD_lan/SCI_alvar_BD_lan.cpg', 'SCI_alvar_BD_lan/SCI_alvar_BD_lan.dbf', 'SCI_alvar_BD_lan/SCI_alvar_BD_lan.prj', 'SCI_alvar_BD_lan/SCI_alvar_BD_lan.shp', 'SCI_alvar_BD_lan/SCI_alvar_BD_lan.shx', 'SCI_ej_alvar_rikstackande/SCI_ej_alvar_rikstackande.cpg', 'SCI_ej_alvar_rikstackande/SCI_ej_alvar_rikstackande.dbf', 'SCI_ej_alvar_rikstackande/SCI_ej_alvar_rikstackande.prj', 'SCI_ej_alvar_rikstackande/SCI_ej_alvar_rikstackande.shp', 'SCI_ej_alvar_rikstackande/SCI_ej_alvar_rikstackande.shx']


In [32]:
import geopandas as gpd
import zipfile, os

zip_path = "data/SCI_Rikstackande.zip"
extract_dir = "data/SCI_Rikstackande"

# Extract
with zipfile.ZipFile(zip_path, "r") as z:
    z.extractall(extract_dir)

# Find shapefiles recursively
shp_files = []
for root, dirs, files in os.walk(extract_dir):
    for f in files:
        if f.endswith(".shp"):
            shp_files.append(os.path.join(root, f))

print("Shapefiles found:", shp_files)

# Load one shapefile
gdf = gpd.read_file(shp_files[0])
print("Loaded", len(gdf), "features from", shp_files[0])
print(gdf.head())


Shapefiles found: ['data/SCI_Rikstackande/SCI_ej_alvar_rikstackande/SCI_ej_alvar_rikstackande.shp', 'data/SCI_Rikstackande/SCI_alvar_BD_lan/SCI_alvar_BD_lan.shp', 'data/SCI_Rikstackande/SCI_alvar_AC_lan/SCI_alvar_AC_lan.shp']
Loaded 4016 features from data/SCI_Rikstackande/SCI_ej_alvar_rikstackande/SCI_ej_alvar_rikstackande.shp
               NAMN                                            BEVPLAN  \
0        Abborravan  https://geodata.naturvardsverket.se/handlingar...   
1  Abborrtjärnsberg  https://geodata.naturvardsverket.se/handlingar...   
2       Abborrträsk  https://geodata.naturvardsverket.se/handlingar...   
3          Abborrås  https://geodata.naturvardsverket.se/handlingar...   
4            Abisko  https://geodata.naturvardsverket.se/handlingar...   

   SITE_CODE OMRADESTYP                            UPPLAMNARE     KOMMUN  \
0  SE0810361        SCI     Länsstyrelsen i Västerbottens län   Lycksele   
1  SE0610084        SCI         Länsstyrelsen i Värmlands län     Torsby 

### 🔧 Python Code to Download + Load Stockholm län Naturreservat


In [35]:
import requests, zipfile, os, geopandas as gpd

# --- Download ---
url = "https://geodata.naturvardsverket.se/nedladdning/naturvardsregistret/NR.zip"
zip_path = "NR.zip"
extract_dir = "data/NR"

if not os.path.exists(zip_path):
    print("Downloading Naturreservat dataset...")
    r = requests.get(url)
    with open(zip_path, "wb") as f:
        f.write(r.content)

# --- Unzip ---
with zipfile.ZipFile(zip_path, "r") as z:
    z.extractall(extract_dir)

# --- Find .shp ---
shp_files = []
for root, dirs, files in os.walk(extract_dir):
    for f in files:
        if f.endswith(".shp"):
            shp_files.append(os.path.join(root, f))

print("Shapefiles found:", shp_files)

# --- Load into GeoPandas ---
gdf = gpd.read_file(shp_files[0])

print(gdf.columns)         # See available fields
print(gdf["LAN"].unique()) # Inspect unique values for county
print(gdf.head())

print("Total Naturreservat in Stockholm län:", len(gdf_stockholm))
print(gdf_stockholm[["NVRID", "NAMN", "KOMMUN"]].head())


Shapefiles found: ['data/NR/NR/NR_polygon.shp']
Index(['NVRID', 'NAMN', 'SKYDDSTYP', 'BESLSTATUS', 'URSBESLDAT', 'IKRAFTDATF',
       'URSGALLDAT', 'SENGALLDAT', 'TILLSYNSMH', 'PROVNMHDIS', 'PROVNMHTIL',
       'LAN', 'KOMMUN', 'IUCNKAT', 'FORVALTARE', 'AREA_HA', 'LAND_HA',
       'VATTEN_HA', 'SKOG_HA', 'GEOSTATUS', 'DIARIENR', 'LAGRUM', 'BESLMYND',
       'geometry'],
      dtype='object')
['Östergötlands Län' 'Värmlands Län' 'Blekinge Län' 'Västerbottens Län'
 'Gotlands Län' 'Uppsala Län' 'Norrbottens Län' 'Dalarnas Län'
 'Stockholms Län' 'Jämtlands Län' 'Västra Götalands Län'
 'Västernorrlands Län' 'Jönköpings Län' 'Gävleborgs Län' 'Skåne Län'
 'Västmanlands Län' 'Hallands Län' 'Kronobergs Län' 'Örebro Län'
 'Kalmar Län' 'Södermanlands Län']
     NVRID            NAMN      SKYDDSTYP BESLSTATUS URSBESLDAT IKRAFTDATF  \
0  2032070  Styra kalkkärr  Naturreservat   Gällande 2012-12-06        NaT   
1  2002137  Högbergsfältet  Naturreservat   Gällande 1979-11-19        NaT   
2  2001440

In [36]:
gdf_stockholm = gdf[gdf["LAN"].str.lower().str.contains("stockholm")]

print("Naturreservat in Stockholm län:", len(gdf_stockholm))
print(gdf_stockholm[["NVRID", "NAMN", "KOMMUN"]].head())


Naturreservat in Stockholm län: 376
      NVRID                 NAMN      KOMMUN
11  2000008           Yttereneby  Södertälje
29  2061101       Telegrafberget      Tyresö
33  2001109             Vindalsö      Värmdö
38  2005573  Flemingsbergsskogen    Huddinge
55  2005594             Mornäsan   Norrtälje


### Merge code

Step 1. Setup helpers

In [38]:
import pandas as pd
import geopandas as gpd
from SPARQLWrapper import SPARQLWrapper, JSON
import requests
import unicodedata
from rapidfuzz import process, fuzz

# Helper: normalize names for fuzzy matching
def normalize_name(name):
    if not isinstance(name, str):
        return ""
    name = unicodedata.normalize("NFKD", name).encode("ascii", "ignore").decode("utf-8")
    name = name.lower()
    name = name.replace("naturreservat", "").replace("nr", "").strip()
    return name


🔧 Step 2. Load NVR (Naturvårdsverket)

In [39]:
gdf_nvr = gpd.read_file("data/NR/NR/NR_polygon.shp")

# Filter Stockholm län
gdf_nvr = gdf_nvr[gdf_nvr["LAN"] == "Stockholms Län"]

# Keep relevant columns
df_nvr = gdf_nvr[["NVRID", "NAMN", "KOMMUN"]].copy()
df_nvr["name_norm"] = df_nvr["NAMN"].apply(normalize_name)

print("NVR reserves in Stockholm:", len(df_nvr))


NVR reserves in Stockholm: 376


🔧 Step 3. Load OSM (Overpass API)

In [46]:
import requests

overpass_url = "https://overpass-api.de/api/interpreter"
query = """
[out:json][timeout:90];
relation(54391)->.area;  // Stockholm län
(
  node(area.area)["leisure"="nature_reserve"];
  way(area.area)["leisure"="nature_reserve"];
  relation(area.area)["leisure"="nature_reserve"];
);
out tags center;
"""

resp = requests.get(overpass_url, params={"data": query})

# Debug response
print("HTTP status:", resp.status_code)
print("First 200 chars:\n", resp.text[:200])

if resp.status_code == 200 and resp.headers.get("Content-Type", "").startswith("application/json"):
    data = resp.json()
    elements = data["elements"]
    print("Got elements:", len(elements))
else:
    raise RuntimeError("Overpass returned non-JSON response, try again or use another endpoint.")


HTTP status: 200
First 200 chars:
 {
  "version": 0.6,
  "generator": "Overpass API 0.7.62.8 e802775f",
  "osm3s": {
    "timestamp_osm_base": "2025-09-29T14:04:15Z",
    "timestamp_areas_base": "2025-09-29T10:04:35Z",
    "copyright":
Got elements: 0


🔧 Step 4. Load Wikidata (SPARQL)

In [41]:
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setReturnFormat(JSON)
sparql.setQuery("""
SELECT ?item ?itemLabel ?NVRID ?website ?osmRel ?osmWay WHERE {
  ?item wdt:P31 wd:Q179049;
        wdt:P131* wd:Q104231;
        wdt:P1281 ?NVRID.
  OPTIONAL { ?item wdt:P856 ?website. }
  OPTIONAL { ?item wdt:P402 ?osmRel. }
  OPTIONAL { ?item wdt:P10689 ?osmWay. }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "sv,en". }
}
""")

results = sparql.query().convert()

records = []
for r in results["results"]["bindings"]:
    rec = {
        "qid": r["item"]["value"].split("/")[-1],
        "label": r.get("itemLabel", {}).get("value"),
        "NVRID": r["NVRID"]["value"],
        "website": r.get("website", {}).get("value"),
        "osmRel": r.get("osmRel", {}).get("value"),
        "osmWay": r.get("osmWay", {}).get("value"),
    }
    rec["name_norm"] = normalize_name(rec["label"])
    records.append(rec)

df_wd = pd.DataFrame(records)
print("Wikidata reserves:", len(df_wd))


Wikidata reserves: 0


🔧 Step 6. Export TODO table

In [42]:
def make_osm_link(osm_id):
    if pd.isna(osm_id): return "*(missing)*"
    t, i = osm_id.split("/")
    return f"[OSM {t} {i}](https://www.openstreetmap.org/{t}/{i})"

def make_wd_link(qid):
    if pd.isna(qid): return "*(missing)*"
    return f"[{qid}](https://www.wikidata.org/wiki/{qid})"

rows = []
for _, row in df_master.iterrows():
    issues = []
    if row["flag_osm"]: issues.append("Missing in OSM")
    if row["flag_wd"]: issues.append("Missing in Wikidata")
    if row["flag_site_mismatch"]: issues.append("Website mismatch")

    rows.append({
        "Name": row["NAMN"],
        "NVRID": row["NVRID"],
        "OSM": make_osm_link(row.get("osm_id")),
        "Wikidata": make_wd_link(row.get("qid")),
        "Issues": ", ".join(issues) if issues else "✅"
    })

df_todo = pd.DataFrame(rows)
print(df_todo.head())

# Save to Markdown
with open("TODO_reserves.md", "w") as f:
    f.write(df_todo.to_markdown(index=False))


NameError: name 'df_master' is not defined

🔧 Step 7. Add fuzzy matching for OSM ↔ NVR


In [43]:
from rapidfuzz import process, fuzz

# Only OSM reserves without NVRID
df_osm_nomatch = df_osm[df_osm["ref_NVRID"].isna()].copy()

matches = []
for idx, row in df_osm_nomatch.iterrows():
    if not row["name_norm"]:
        continue
    match = process.extractOne(
        row["name_norm"], 
        df_nvr["name_norm"], 
        scorer=fuzz.ratio
    )
    if match and match[1] >= 85:  # threshold
        # get NVR record
        nvr_row = df_nvr[df_nvr["name_norm"] == match[0]].iloc[0]
        matches.append({
            "osm_id": row["osm_id"],
            "osm_name": row["name"],
            "NVRID": nvr_row["NVRID"],
            "nvr_name": nvr_row["NAMN"],
            "score": match[1]
        })

df_osm_fuzzy = pd.DataFrame(matches)
print("Fuzzy matches found:", len(df_osm_fuzzy))


KeyError: 'ref_NVRID'

## New try

In [47]:
# export_OSM_leisure_nature_reserve.geojson

In [50]:
import os

cwd = os.getcwd()
print("Current working directory:", cwd)

Current working directory: /Users/salgo/Documents/GitHub/Stockholm_Archipelago_Trail/notebook


In [51]:
import geopandas as gpd

gdf_osm = gpd.read_file("export_OSM_leisure_nature_reserve.geojson")

# Extract tags we care about
records = []
for _, row in gdf_osm.iterrows():
    tags = row.get("tags", {})
    rec = {
        "osm_id": f"{row['osm_type']}/{row['osm_id']}" if "osm_id" in row else None,
        "name": tags.get("name"),
        "ref_NVRID": tags.get("ref:NVRID"),
        "wikidata": tags.get("wikidata"),
        "website": tags.get("website"),
    }
    records.append(rec)

import pandas as pd
df_osm = pd.DataFrame(records)

print("Loaded OSM reserves:", len(df_osm))
print(df_osm.head())


Loaded OSM reserves: 558
  osm_id  name ref_NVRID wikidata website
0   None  None      None     None    None
1   None  None      None     None    None
2   None  None      None     None    None
3   None  None      None     None    None
4   None  None      None     None    None


In [54]:
import geopandas as gpd

gdf_osm = gpd.read_file("export_OSM_leisure_nature_reserve.geojson")
print(gdf_osm.columns)
print(gdf_osm.head(2))


Index(['id', '@id', 'LAN', 'NVRID', 'access', 'addr:city', 'addr:postcode',
       'alt_name', 'boundary', 'comment', 'communication:amateur_radio:pota',
       'description', 'description:en', 'dog', 'fixme', 'governance_type',
       'historic', 'image', 'image:license', 'ist:area_ha', 'ist:land_ha',
       'ist:tillkomst', 'ist:vatten_ha', 'landuse', 'leaf_cycle', 'leaf_type',
       'leisure', 'lst:anmärkning', 'lst:area_ha', 'lst:metodbeskrivning',
       'lst:ref', 'lst:reviderad', 'lst:tillkomst', 'lst:url', 'lst_area_h',
       'lst_ref', 'name', 'name:en', 'natural', 'note', 'official_name',
       'old_name', 'old_ref:DIARIENR', 'old_ref:NVRID', 'old_ref:lst',
       'openfire', 'opening_hours', 'operator', 'operator:type',
       'operator:wikidata', 'place', 'protect_class', 'protection_title',
       'ref:DIARIENR', 'ref:NVRID', 'ref:OBJECTID', 'ref:lst', 'related_law',
       'short_name', 'site_status', 'source', 'start_date', 'type', 'url',
       'website', 'wikidata',

In [55]:
records = []
for _, row in gdf_osm.iterrows():
    rec = {
        "osm_id": row.get("id"),
        "name": row.get("name"),
        "ref_NVRID": row.get("ref:NVRID"),
        "wikidata": row.get("wikidata"),
        "website": row.get("website"),
    }
    records.append(rec)

df_osm = pd.DataFrame(records)
print("Loaded OSM reserves:", len(df_osm))
print(df_osm.head())


Loaded OSM reserves: 558
             osm_id                        name ref_NVRID   wikidata  \
0  relation/1381602  Abborrträsks naturreservat   2013015  Q10397880   
1  relation/1460258         Nynäs naturreservat   2001921  Q10604865   
2  relation/1460259                       Persö      None  Q30165688   
3  relation/1460260                      Ringsö      None  Q30171026   
4  relation/1460261                  Marvikarna      None   Q3361676   

                                             website  
0  https://www.nacka.se/boende-miljo/natur-och-pa...  
1                  https://nynasslott.se/reservatet/  
2                                               None  
3                                               None  
4                                               None  
