* #55 
* [#55](https://github.com/salgo60/SCB-Wikidata/issues/55)
* Notebook [Linkroot_all_55.ipynb](http://localhost:8888/notebooks/SCB-Wikidata/notebook/Linkroot_all_55.ipynb)

In [1]:
import time

from datetime import datetime

now = datetime.now()
timestamp = now.timestamp()

start_time = time.time()
print("Start:", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

Start: 2026-01-10 15:43:38


In [2]:
import os

# Get the current working directory
current_directory = os.getcwd()
print("Current Working Directory:", current_directory)



Current Working Directory: /Users/salgo/Documents/GitHub/SCB-Wikidata/notebook


In [4]:
import time

from datetime import datetime

now = datetime.now()
timestamp = now.timestamp()

start_time = time.time()
print("Start:", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

Start: 2026-01-10 15:42:01


In [3]:

SCRIPT_NAME = "Linkroot_all_55.ipynb"
SCRIPT_URL = (
    "https://github.com/salgo60/SCB-Wikidata/"
    "blob/master/notebook/Linkroot_all_55.ipynb"
) 


In [7]:
import os

# Get the current working directory
current_directory = os.getcwd()
print("Current Working Directory:", current_directory)



Current Working Directory: /Users/salgo/Documents/GitHub/SCB-Wikidata/notebook


In [5]:
import requests

def fetch_sitematrix_df():
    url = "https://meta.wikimedia.org/w/api.php"
    params = {
        "action": "sitematrix",
        "format": "json"
    }
    headers = {
        "User-Agent": "salgo60-language-fetcher/1.0 (salgo60@msn.com)"
    }

    print("[DEBUG] Fetching sitematrix‚Ä¶")
    r = requests.get(url, params=params, headers=headers)
    r.raise_for_status()

    if "application/json" not in r.headers.get("Content-Type", ""):
        raise ValueError("Server returned non-JSON response")

    data = r.json()["sitematrix"]

    rows = []

    # --- language-specific sites ---
    for key, lang_block in data.items():
        if not key.isdigit():
            continue  # skip "count", "specials"

        lang_code = lang_block.get("code")
        lang_name = lang_block.get("name")

        for site in lang_block.get("site", []):
            rows.append({
                "lang_code": lang_code,
                "lang_name": lang_name,
                "project": site.get("project"),
                "url": site.get("url"),
                "dbname": site.get("dbname"),
                "site_name": site.get("sitename"),
                "closed": site.get("closed", False)
            })

    # --- special wikis (Wikidata, Commons, Meta, etc.) ---
    for site in data.get("specials", []):
        rows.append({
            "lang_code": "special",
            "lang_name": "special",
            "project": site.get("project"),
            "url": site.get("url"),
            "dbname": site.get("dbname"),
            "site_name": site.get("sitename"),
            "closed": site.get("closed", False)
        })

    return pd.DataFrame(rows)


In [6]:
import requests
import pandas as pd


HEADERS = {
    "User-Agent": "salgo60-language-fetcher/2.0 (https://github.com/salgo60)"
}


df_lang_fetch = fetch_sitematrix_df()
df_lang_fetch["closed"] = df_lang_fetch["closed"].fillna(False).astype(bool)

df_lang_wikipedia = df_lang_filtered = df_lang_fetch[
    (df_lang_fetch["site_name"] == "Wikipedia") &
    (df_lang_fetch["lang_name"].str.lower() != "special")]

#df_lang_wikipedia.to_csv("test.csv")
df_lang_wikipedia.info()

[DEBUG] Fetching sitematrix‚Ä¶
<class 'pandas.core.frame.DataFrame'>
Index: 184 entries, 0 to 937
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   lang_code  184 non-null    object
 1   lang_name  183 non-null    object
 2   project    0 non-null      object
 3   url        184 non-null    object
 4   dbname     184 non-null    object
 5   site_name  184 non-null    object
 6   closed     184 non-null    bool  
dtypes: bool(1), object(6)
memory usage: 10.2+ KB


In [20]:
import requests
import pandas as pd
import re

WIKIDATA_ENDPOINT = "https://query.wikidata.org/sparql"

SPARQL = """
SELECT ?k ?kLabel ?web ?insLabel WHERE {
  VALUES ?ins { wd:Q16296141 wd:Q127448 wd:Q68295960 wd:Q107407151 }
  ?k wdt:P31 ?ins .
  ?k wdt:P17 wd:Q34 .
  OPTIONAL { ?k wdt:P856 ?web }
  MINUS { ?k wdt:P576 ?diss }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "sv,en,mul". }
}
ORDER BY ?kLabel
"""

def extract_se_domain(url):
    if not url:
        return None

    url = url.lower()

    # remove protocol
    url = re.sub(r"^https?://", "", url)

    # remove path, params, fragments
    url = url.split("/")[0]

    # remove www.
    url = re.sub(r"^www\.", "", url)

    # keep only xxxx.se
    m = re.search(r"([a-z0-9\-]+\.se)$", url)
    if m:
        return m.group(1)

    return None


In [22]:
def read_domains_from_wikidata():
    headers = {
        "Accept": "application/sparql-results+json",
        "User-Agent": "ESV-domain-audit/1.0 (https://github.com/salgo60)"
    }

    r = requests.get(
        WIKIDATA_ENDPOINT,
        params={"query": SPARQL, "format": "json"},
        headers=headers,
        timeout=60
    )

    if r.status_code != 200 or not r.text.strip().startswith("{"):
        raise RuntimeError("Wikidata did not return JSON:\n" + r.text[:300])

    data = r.json()

    rows = []
    for b in data["results"]["bindings"]:
        website = b.get("web", {}).get("value")

        rows.append({
            "wikidata": b["k"]["value"],
            "label": b["kLabel"]["value"],
            "type": b["insLabel"]["value"],
            "website": website,
            "domain_se": extract_se_domain(website)
        })

    df = pd.DataFrame(rows)

    # Remove rows without a usable .se domain
    df = df[df["domain_se"].notna()]

    # Deduplicate on domain
    df = df.drop_duplicates(subset="domain_se").sort_values("domain_se")

    return df



In [25]:
domains = read_domains_from_wikidata()
domains.info()


<class 'pandas.core.frame.DataFrame'>
Index: 505 entries, 565 to 559
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   wikidata   505 non-null    object
 1   label      505 non-null    object
 2   type       505 non-null    object
 3   website    505 non-null    object
 4   domain_se  505 non-null    object
dtypes: object(5)
memory usage: 23.7+ KB


In [37]:
session = requests.Session()
session.headers.update({
    "User-Agent": "SCB-LinkAudit/1.0 salgo60@msn.com)"
})


In [38]:
domains.head()


Unnamed: 0,wikidata,label,type,website,domain_se
565,http://www.wikidata.org/entity/Q3406485,√Öklagarmyndigheten,svensk statlig myndighet,http://www.aklagare.se,aklagare.se
0,http://www.wikidata.org/entity/Q498470,Ale kommun,kommun,http://www.ale.se,ale.se
1,http://www.wikidata.org/entity/Q503162,Alings√•s kommun,kommun,https://www.alingsas.se/,alingsas.se
560,http://www.wikidata.org/entity/Q254799,√Ñlmhults kommun,kommun,http://www.almhult.se,almhult.se
561,http://www.wikidata.org/entity/Q123855,√Ñlvdalens kommun,kommun,http://www.alvdalen.se,alvdalen.se


In [39]:
authority_domains = set(domains["domain_se"])

print("Antal myndighetsdom√§ner:", len(authority_domains))

Antal myndighetsdom√§ner: 505


In [42]:
authority_domains

{'aklagare.se',
 'ale.se',
 'alingsas.se',
 'almhult.se',
 'alvdalen.se',
 'alvesta.se',
 'alvkarleby.se',
 'alvsbyn.se',
 'amal.se',
 'aneby.se',
 'ange.se',
 'angelholm.se',
 'arbetsformedlingen.se',
 'arbetsgivarverket.se',
 'arboga.se',
 'are.se',
 'arjang.se',
 'arjeplog.se',
 'arkdes.se',
 'arn.se',
 'arvidsjaur.se',
 'arvika.se',
 'arvsfonden.se',
 'asele.se',
 'askersund.se',
 'astorp.se',
 'atvidaberg.se',
 'av.se',
 'avesta.se',
 'barnombudsmannen.se',
 'bastad.se',
 'bengtsfors.se',
 'berg.se',
 'bfn.se',
 'bjurholm.se',
 'bjuv.se',
 'boden.se',
 'bolagsverket.se',
 'bollebygd.se',
 'bollnas.se',
 'boras.se',
 'borgholm.se',
 'borlange.se',
 'botkyrka.se',
 'boverket.se',
 'boxholm.se',
 'bra.se',
 'bracke.se',
 'bromolla.se',
 'brottsoffermyndigheten.se',
 'bth.se',
 'burlov.se',
 'cdfn.se',
 'csn.se',
 'dalsed.se',
 'danderyd.se',
 'degerfors.se',
 'digg.se',
 'do.se',
 'domstol.se',
 'dorotea.se',
 'du.se',
 'eda.se',
 'ehalsomyndigheten.se',
 'ei.se',
 'ekero.se',
 'ekn.

In [40]:
import time
import re
import requests

def normalize_url_domain(url):
    url = url.lower()
    url = re.sub(r"^https?://", "", url)
    url = url.split("/")[0]
    url = re.sub(r"^www\.", "", url)
    return url

def fetch_exturlusage(lang, authority_domains, sleep=0.5):
    api = f"https://{lang}.wikipedia.org/w/api.php"

    params = {
        "action": "query",
        "list": "exturlusage",
        "euquery": ".se",
        "eulimit": "500",
        "format": "json"
    }

    while True:
        try:
            r = session.get(api, params=params, timeout=30)

            # HTTP-level failure
            if r.status_code != 200:
                print(f"[{lang}] HTTP {r.status_code}")
                break

            # JSON missing ‚Üí HTML or empty response
            if not r.text.strip().startswith("{"):
                print(f"[{lang}] Non-JSON response (rate limit or HTML)")
                break

            data = r.json()

        except Exception as e:
            print(f"[{lang}] API error: {e}")
            break

        for e in data.get("query", {}).get("exturlusage", []):
            raw_url = e["url"]
            domain = normalize_url_domain(raw_url)

            if domain in authority_domains:
                yield {
                    "lang": lang,
                    "pageid": e["pageid"],
                    "title": e["title"],
                    "url": raw_url,
                    "domain": domain
                }

        if "continue" not in data:
            break

        params.update(data["continue"])
        time.sleep(sleep)


In [43]:
print("Antal spr√•k:", len(df_lang_wikipedia))

results = []

for i, row in df_lang_wikipedia.iterrows():
    lang = row["lang_code"]
    lang_name = row["lang_name"]

    print(f"[{i+1}/{len(df_lang_wikipedia)}] {lang} ‚Äì {lang_name}")

    before = len(results)

    try:
        for entry in fetch_exturlusage(lang, authority_domains):
            results.append(entry)
    except Exception as e:
        print(f"  {lang} aborted: {e}")
        continue

    links = len(results) - before
    print(f"  {lang}: {links} myndighetsl√§nkar",i,row)


Antal spr√•k: 184
[1/184] aa ‚Äì Qaf√°r af
  aa: 0 myndighetsl√§nkar 0 lang_code                          aa
lang_name                    Qaf√°r af
project                          None
url          https://aa.wikipedia.org
dbname                         aawiki
site_name                   Wikipedia
closed                          False
Name: 0, dtype: object
[6/184] ace ‚Äì Ac√®h
  ace: 0 myndighetsl√§nkar 5 lang_code                          ace
lang_name                         Ac√®h
project                           None
url          https://ace.wikipedia.org
dbname                         acewiki
site_name                    Wikipedia
closed                           False
Name: 5, dtype: object
[8/184] af ‚Äì Afrikaans
  af: 0 myndighetsl√§nkar 7 lang_code                          af
lang_name                   Afrikaans
project                          None
url          https://af.wikipedia.org
dbname                         afwiki
site_name                   Wikipedia
closed    