# 212 check links
* [#212](https://github.com/salgo60/Stockholm_Archipelago_Trail/issues/212)

In [3]:
import requests
import pandas as pd

# 1. Load your list of länsstyrelsen.se links
with open("lansstyrelsen_se_links.txt") as f:
    links = [line.strip() for line in f if line.strip()]

links_df = pd.DataFrame({"url": links})


In [9]:
links_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   url     367 non-null    object
dtypes: object(1)
memory usage: 3.0+ KB


In [14]:
import requests
import pandas as pd

# 1. Load your list of länsstyrelsen.se links
print("Step 1: Loading länsstyrelsen.se links...")
with open("lansstyrelsen_se_links.txt") as f:
    links = [line.strip() for line in f if line.strip()]

links_df = pd.DataFrame({"url": links})
print(f"  Loaded {len(links_df)} links from file")

# 2. Query Wikidata
print("Step 2: Querying Wikidata for nature reserves in Stockholms län...")
sparql = """
SELECT ?item ?itemLabel ?ref ?osm_rel ?osm_way WHERE {
  ?item wdt:P31 wd:Q179049;
        (wdt:P131/wdt:P131) wd:Q104231;
        wdt:P856 ?ref.
  OPTIONAL { ?item wdt:P402 ?osm_rel. }   # OSM relation ID
  OPTIONAL { ?item wdt:P10689 ?osm_way. } # OSM way ID
  SERVICE wikibase:label { bd:serviceParam wikibase:language "sv,en". }
}
"""

url = "https://query.wikidata.org/sparql"
r = requests.get(url, params={'query': sparql, 'format': 'json'})
data = r.json()

rows = []
for b in data["results"]["bindings"]:
    rows.append({
        "item": b["item"]["value"],
        "label": b.get("itemLabel", {}).get("value", ""),
        "ref": b.get("ref", {}).get("value", ""),
        "osm_rel": b.get("osm_rel", {}).get("value", None),
        "osm_way": b.get("osm_way", {}).get("value", None)
    })

wd_df = pd.DataFrame(rows)
print(f"  Retrieved {len(wd_df)} reserves from Wikidata")

# 3. Merge Wikidata results with länsstyrelsen list
print("Step 3: Merging Wikidata results with länsstyrelsen links...")
merged = links_df.merge(wd_df, left_on="url", right_on="ref", how="left")
merged["in_wikidata"] = merged["item"].notna()
print(f"  {merged['in_wikidata'].sum()} links found in Wikidata, {len(merged) - merged['in_wikidata'].sum()} missing")

# 4. For those with OSM references, query Overpass
print("Step 4: Checking OSM backlinks with Overpass API...")
overpass_url = "https://overpass-api.de/api/interpreter"

def check_osm(osm_id, osm_type="relation"):
    """
    osm_type: "relation" or "way"
    """
    if osm_id is None:
        return None, None
    
    q = f"""
    [out:json][timeout:25];
    {osm_type}({osm_id});
    out tags;
    """
    r = requests.get(overpass_url, params={'data': q})
    if r.status_code != 200:
        return None, None
    js = r.json()
    if "elements" not in js or len(js["elements"]) == 0:
        return None, None
    tags = js["elements"][0].get("tags", {})
    return tags.get("wikidata"), tags.get("website")

merged["osm_backlink_wd"] = None
merged["osm_backlink_site"] = None

for idx, row in merged.iterrows():
    if pd.notna(row["osm_rel"]):
        print(f"  Checking OSM relation {row['osm_rel']} ({row['label']})...")
        wd, site = check_osm(row["osm_rel"], "relation")
    elif pd.notna(row["osm_way"]):
        print(f"  Checking OSM way {row['osm_way']} ({row['label']})...")
        wd, site = check_osm(row["osm_way"], "way")
    else:
        wd, site = None, None
    
    merged.at[idx, "osm_backlink_wd"] = wd
    merged.at[idx, "osm_backlink_site"] = site

# 5. Consistency checks
merged["osm_ref_matches_wd"] = merged["osm_backlink_wd"] == merged["item"].str.replace("http://www.wikidata.org/entity/", "")
merged["osm_ref_matches_site"] = merged["osm_backlink_site"] == merged["url"]

print("Step 5: Finished cross-checks ✅")
print(merged[["url", "in_wikidata", "osm_rel", "osm_way", "osm_backlink_wd", "osm_backlink_site", "osm_ref_matches_wd", "osm_ref_matches_site"]].head(20))

# Save results
merged.to_csv("wikidata_osm_check.csv", index=False)
print("Results written to wikidata_osm_check.csv")


Step 1: Loading länsstyrelsen.se links...
  Loaded 367 links from file
Step 2: Querying Wikidata for nature reserves in Stockholms län...
  Retrieved 190 reserves from Wikidata
Step 3: Merging Wikidata results with länsstyrelsen links...
  150 links found in Wikidata, 222 missing
Step 4: Checking OSM backlinks with Overpass API...
  Checking OSM relation 1381602 (Abborrträsks naturreservat)...
  Checking OSM relation 2651046 (Alby naturreservat)...
  Checking OSM way 103228501 (Angarnssjöängens naturreservat)...
  Checking OSM relation 10626317 (Angödrommens naturreservat)...
  Checking OSM relation 1463750 (Arholma-Idö naturreservat)...
  Checking OSM way 103224764 (Arsläjans naturreservat)...
  Checking OSM way 103249546 (Askarehage naturreservat)...
  Checking OSM relation 9769162 (Aspdalssjön)...
  Checking OSM relation 18265864 (Barnens Ö naturreservat)...
  Checking OSM relation 10626309 (Biskopsö naturreservat)...
  Checking OSM way 103225890 (Björinge)...
  Checking OSM relatio

In [11]:
# 5. Check consistency
merged["osm_ref_matches_wd"] = merged["osm_backlink_wd"] == merged["item"].str.replace("http://www.wikidata.org/entity/", "")
merged["osm_ref_matches_site"] = merged["osm_backlink_site"] == merged["url"]


In [12]:
# 6. Save results
merged.to_csv("wikidata_osm_check.csv", index=False)


In [15]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 372 entries, 0 to 371
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   url                   372 non-null    object
 1   item                  150 non-null    object
 2   label                 150 non-null    object
 3   ref                   150 non-null    object
 4   osm_rel               102 non-null    object
 5   osm_way               36 non-null     object
 6   in_wikidata           372 non-null    bool  
 7   osm_backlink_wd       132 non-null    object
 8   osm_backlink_site     53 non-null     object
 9   osm_ref_matches_wd    372 non-null    bool  
 10  osm_ref_matches_site  372 non-null    bool  
dtypes: bool(3), object(8)
memory usage: 24.5+ KB


In [16]:
# Add QID column for convenience
merged["qid"] = merged["item"].str.replace("http://www.wikidata.org/entity/", "", regex=False)

# --- 1. Wikidata TODOs ---
wikidata_todos = []

# Missing in Wikidata -> add new item or at least add P856
missing_wd = merged[~merged["in_wikidata"]]
for _, row in missing_wd.iterrows():
    wikidata_todos.append({
        "url": row["url"],
        "todo": f"❌ Not in Wikidata → Create item for naturreservat and add P856: {row['url']}"
    })

# In Wikidata but missing OSM reference
missing_osm = merged[merged["in_wikidata"] & merged["osm_rel"].isna() & merged["osm_way"].isna()]
for _, row in missing_osm.iterrows():
    wikidata_todos.append({
        "url": row["url"],
        "qid": row["qid"],
        "todo": f"⚠️ {row['qid']} missing OSM reference → add P402 or P10689"
    })

# --- 2. OSM TODOs ---
osm_todos = []

# Missing Wikidata backlink in OSM
bad_wd_backlink = merged[merged["in_wikidata"] & (merged["osm_backlink_wd"].isna() | ~merged["osm_ref_matches_wd"])]
for _, row in bad_wd_backlink.iterrows():
    osm_todos.append({
        "qid": row["qid"],
        "osm_rel": row["osm_rel"],
        "osm_way": row["osm_way"],
        "todo": f"🔗 Add/Correct wikidata={row['qid']} in OSM relation/way"
    })

# Missing website backlink in OSM
bad_site_backlink = merged[(merged["osm_backlink_site"].isna() | ~merged["osm_ref_matches_site"])]
for _, row in bad_site_backlink.iterrows():
    osm_todos.append({
        "qid": row.get("qid", None),
        "osm_rel": row["osm_rel"],
        "osm_way": row["osm_way"],
        "todo": f"🌐 Add/Correct website={row['url']} in OSM relation/way"
    })

# --- 3. Export TODOs ---
wikidata_todo_df = pd.DataFrame(wikidata_todos)
osm_todo_df = pd.DataFrame(osm_todos)

wikidata_todo_df.to_csv("wikidata_todo.csv", index=False)
osm_todo_df.to_csv("osm_todo.csv", index=False)

print("Wikidata TODOs:")
print(wikidata_todo_df.head(20))
print("\nOSM TODOs:")
print(osm_todo_df.head(20))



Wikidata TODOs:
                                                  url  \
0   https://www.lansstyrelsen.se/stockholm/besoksm...   
1   https://www.lansstyrelsen.se/stockholm/besoksm...   
2   https://www.lansstyrelsen.se/stockholm/besoksm...   
3   https://www.lansstyrelsen.se/stockholm/besoksm...   
4   https://www.lansstyrelsen.se/stockholm/besoksm...   
5   https://www.lansstyrelsen.se/stockholm/besoksm...   
6   https://www.lansstyrelsen.se/stockholm/besoksm...   
7   https://www.lansstyrelsen.se/stockholm/besoksm...   
8   https://www.lansstyrelsen.se/stockholm/besoksm...   
9   https://www.lansstyrelsen.se/stockholm/besoksm...   
10  https://www.lansstyrelsen.se/stockholm/besoksm...   
11  https://www.lansstyrelsen.se/stockholm/besoksm...   
12  https://www.lansstyrelsen.se/stockholm/besoksm...   
13  https://www.lansstyrelsen.se/stockholm/besoksm...   
14  https://www.lansstyrelsen.se/stockholm/besoksm...   
15  https://www.lansstyrelsen.se/stockholm/besoksm...   
16  https://www

In [17]:
# Add QuickStatements column for Wikidata fixes
def make_qs(row):
    if pd.isna(row["qid"]):  
        # Item missing entirely → can’t build QS (needs new item creation)
        return f'CREATE\nLAST|P31|Q179049\nLAST|P131|Q104231\nLAST|P856|"{row["url"]}"'
    else:
        # Item exists, just add missing website
        return f'{row["qid"]}|P856|"{row["url"]}"'

wikidata_todos = []

# Case 1: Missing in Wikidata
missing_wd = merged[~merged["in_wikidata"]]
for _, row in missing_wd.iterrows():
    wikidata_todos.append({
        "url": row["url"],
        "qid": None,
        "todo": f"❌ Not in Wikidata → Create item",
        "quickstatement": make_qs(row)
    })

# Case 2: In Wikidata but missing OSM ref
missing_osm = merged[merged["in_wikidata"] & merged["osm_rel"].isna() & merged["osm_way"].isna()]
for _, row in missing_osm.iterrows():
    wikidata_todos.append({
        "url": row["url"],
        "qid": row["qid"],
        "todo": f"⚠️ Missing OSM reference → add P402/P10689",
        "quickstatement": None  # QS not possible automatically
    })

wikidata_todo_df = pd.DataFrame(wikidata_todos)
wikidata_todo_df.to_csv("wikidata_todo.csv", index=False)

print("Wikidata TODOs with QuickStatements:")
print(wikidata_todo_df.head(20))


Wikidata TODOs with QuickStatements:
                                                  url   qid  \
0   https://www.lansstyrelsen.se/stockholm/besoksm...  None   
1   https://www.lansstyrelsen.se/stockholm/besoksm...  None   
2   https://www.lansstyrelsen.se/stockholm/besoksm...  None   
3   https://www.lansstyrelsen.se/stockholm/besoksm...  None   
4   https://www.lansstyrelsen.se/stockholm/besoksm...  None   
5   https://www.lansstyrelsen.se/stockholm/besoksm...  None   
6   https://www.lansstyrelsen.se/stockholm/besoksm...  None   
7   https://www.lansstyrelsen.se/stockholm/besoksm...  None   
8   https://www.lansstyrelsen.se/stockholm/besoksm...  None   
9   https://www.lansstyrelsen.se/stockholm/besoksm...  None   
10  https://www.lansstyrelsen.se/stockholm/besoksm...  None   
11  https://www.lansstyrelsen.se/stockholm/besoksm...  None   
12  https://www.lansstyrelsen.se/stockholm/besoksm...  None   
13  https://www.lansstyrelsen.se/stockholm/besoksm...  None   
14  https://www.la