* [#4](https://github.com/salgo60/SCB-Wikidata/issues/4)
* denna [GetAllLinks.ipynb](https://github.com/salgo60/SCB-Wikidata/blob/main/notebook/GetSCBLinks.ipynb)

Tanken är att söka igenom alla språkversioner av Wikipedia efter externa länkar till svenska myndigheters webbplatser.

In [20]:
import time

from datetime import datetime

# Capture the moment the run begins, both as a datetime and as a POSIX timestamp.
now = datetime.now()
timestamp = now.timestamp()

# Independent epoch-seconds marker.
# NOTE(review): presumably used later to report elapsed time - not read in the cells visible here.
start_time = time.time()
print("Start:", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

Start: 2025-11-18 13:34:55


In [21]:
def read_language_codes(file_path):
    """Return the distinct, non-empty values of the 'Language Code' column.

    The CSV is read with pandas' default NA markers disabled, so that real
    language codes which happen to look like missing-value markers (for
    example "nan" or "NA") are kept. Only genuinely empty cells are treated
    as missing and dropped.

    Args:
        file_path: Path (or anything ``pandas.read_csv`` accepts) of a CSV
            file with a 'Language Code' header.

    Returns:
        list: Unique language codes in order of first appearance.

    Raises:
        KeyError: If the CSV has no 'Language Code' column.
    """
    print(f"[DEBUG] Reading language codes from: {file_path}")
    # keep_default_na=False: by default read_csv converts strings such as
    # "nan", "NA" or "null" to NaN, which dropna() would then discard.
    df = pd.read_csv(file_path, keep_default_na=False, na_values=[""])
    lang_codes = df['Language Code'].dropna().unique().tolist()
    print(f"[DEBUG] Found {len(lang_codes)} language codes.")
    return lang_codes
    
def read_domains(file_path):
    """Return the distinct, non-missing entries of a header-less domain CSV.

    Args:
        file_path: Path (or anything ``pandas.read_csv`` accepts) of a CSV
            file without a header row; its values are read into a column
            named 'domain'.

    Returns:
        list: Unique domains in order of first appearance.
    """
    print(f"[DEBUG] Reading domains from: {file_path}")
    domain_column = pd.read_csv(file_path, header=None, names=['domain'])['domain']
    without_missing = domain_column.dropna()
    domains_list = without_missing.unique().tolist()
    print(f"[DEBUG] Found {len(domains_list)} domains.")
    return domains_list

In [3]:
import requests

def fetch_sitematrix(timeout=30):
    """Fetch the Wikimedia site matrix from the Meta-Wiki API.

    Args:
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely on a stalled
            connection, which would hang the notebook cell.

    Returns:
        The parsed JSON body of the ``action=sitematrix`` response.

    Raises:
        ValueError: If the response Content-Type is not JSON.
        requests.exceptions.RequestException: On network failure or timeout.
    """
    url = "https://meta.wikimedia.org/w/api.php"
    params = {
        "action": "sitematrix",
        "format": "json"
    }
    headers = {
        "User-Agent": "salgo60-language-fetcher/1.0 (salgo60@msn.com)"
    }

    print("[DEBUG] Fetching sitematrix…")
    r = requests.get(url, params=params, headers=headers, timeout=timeout)

    # If the server still did not answer with JSON, show the start of the
    # body so the cause (e.g. an HTML error page) is visible.
    if "application/json" not in r.headers.get("Content-Type", ""):
        print("[ERROR] Server returned non-JSON response:")
        print(r.text[:500])
        raise ValueError("Server returned non-JSON")

    return r.json()

# Smoke test: fetch the site matrix once and show the top-level keys of the parsed JSON.
data = fetch_sitematrix()
print("OK! Keys:", data.keys())


[DEBUG] Fetching sitematrix…
OK! Keys: dict_keys(['sitematrix'])


In [10]:
import requests

def debug_sitematrix(timeout=30):
    """Request the site matrix and print the raw response for troubleshooting.

    Prints the HTTP status, the Content-Type header and up to the first
    5000 characters of the response body. Nothing is returned.

    Args:
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook cell.

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
    """
    url = "https://meta.wikimedia.org/w/api.php"
    params = {
        "action": "sitematrix",
        "format": "json"
    }
    headers = {
        "User-Agent": "salgo60-language-fetcher/1.0 (https://github.com/salgo60; contact=salgo60@msn.com)"
    }

    print("[DEBUG] Requesting…")
    r = requests.get(url, params=params, headers=headers, timeout=timeout)

    print("[DEBUG] Status:", r.status_code)
    print("[DEBUG] Content-Type:", r.headers.get("Content-Type"))
    print("\n---------- RESPONSE START ----------\n")
    print(r.text[:5000])  # print at most 5000 characters
    print("\n---------- RESPONSE END ----------\n")

#debug_sitematrix()


In [22]:
import os

# Show the current working directory, i.e. the directory that relative paths
# such as "sources/wiki_versions_all.csv" are resolved against.
current_directory = os.getcwd()
print("Current Working Directory:", current_directory)



Current Working Directory: /Users/salgo/Documents/GitHub/SCB-Wikidata/notebook


In [5]:
import requests
import pandas as pd


# Request headers passed to requests.get() by fetch_sitematrix(); the
# User-Agent identifies this client to the Wikimedia API.
HEADERS = {
    "User-Agent": "salgo60-language-fetcher/2.0 (https://github.com/salgo60)"
}


def fetch_sitematrix(timeout=30):
    """Fetch the Wikimedia site matrix using the module-level HEADERS.

    Args:
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely on a stalled
            connection, which would hang the notebook cell.

    Returns:
        The parsed JSON body of the ``action=sitematrix`` response.

    Raises:
        ValueError: If the response Content-Type is not JSON; the message
            includes the first 500 characters of the body.
        requests.exceptions.RequestException: On network failure or timeout.
    """
    url = "https://meta.wikimedia.org/w/api.php"
    params = {
        "action": "sitematrix",
        "format": "json"
    }

    r = requests.get(url, params=params, headers=HEADERS, timeout=timeout)

    if "application/json" not in r.headers.get("Content-Type", ""):
        raise ValueError("Non-JSON response:\n" + r.text[:500])

    return r.json()





In [23]:
# CSV with the Wikipedia language versions; read_language_codes() reads its 'Language Code' column.
file_path_lang = "sources/wiki_versions_all.csv"

# Load the distinct language codes and preview them as a single-column DataFrame.
lang_list = read_language_codes(file_path_lang)
df_lang = pd.DataFrame(lang_list)
df_lang.head()



[DEBUG] Reading language codes from: sources/wiki_versions_all.csv
[DEBUG] Found 323 language codes.


Unnamed: 0,0
0,ab
1,ace
2,ady
3,af
4,sq


In [24]:
from urllib.parse import urlparse

def normalize_domain(url):
    """Reduce a URL to its host name, without a leading ``www.``.

    Args:
        url: URL string such as ``"https://www.scb.se/hitta-statistik"``.
            A value without a scheme (``"www.scb.se/x"``) is accepted too.

    Returns:
        str or None: The host part of ``url`` (e.g. ``"scb.se"``), or
        ``None`` when ``url`` is not a string or is blank. An empty string
        is returned when the URL has a scheme but no network location.
    """
    if not isinstance(url, str) or url.strip() == "":
        return None

    url = url.strip()
    parsed = urlparse(url)
    host = parsed.netloc

    # urlparse only fills netloc for text that follows "//"; a bare
    # "www.example.se/path" would otherwise come back with an empty host.
    if not host and not parsed.scheme:
        host = urlparse("//" + url).netloc

    # Strip only a *leading* "www." - str.replace would also remove it from
    # the middle of a host name ("awww.example.se" -> "aexample.se").
    if host.startswith("www."):
        host = host[len("www."):]

    return host


In [25]:
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON

# --- 1. Load language list ---
# Re-read the language codes from the CSV and keep them in a DataFrame with a named "lang" column.
file_path_lang = "sources/wiki_versions_all.csv"
lang_list = read_language_codes(file_path_lang)       # list of code strings, e.g. ['sv', 'en', 'de', ...]
df_lang = pd.DataFrame(lang_list, columns=["lang"])


# --- 2. Fetch Swedish government agencies from Wikidata ---
def fetch_swedish_agencies():
    """Query Wikidata for Swedish government agencies and their websites.

    Returns:
        pandas.DataFrame: One row per query result, with the columns
        ``qid`` (last path segment of the entity URI), ``agency_label``
        and ``domain`` (the website value, or ``None`` when the result has
        no website binding).
    """
    query = """
     SELECT ?qid ?qidLabel ?website WHERE {
      ?qid wdt:P31 wd:Q68295960 .     # instance of: Swedish government agency
      OPTIONAL { ?qid wdt:P856 ?website }  # website
      SERVICE wikibase:label { bd:serviceParam wikibase:language "sv,en". }
    } order by ?qidLabel

    """
    endpoint = SPARQLWrapper("https://query.wikidata.org/sparql")
    endpoint.setQuery(query)
    endpoint.setReturnFormat(JSON)
    bindings = endpoint.query().convert()["results"]["bindings"]

    # The website is OPTIONAL in the query, so that binding may be absent.
    records = [
        {
            "qid": binding["qid"]["value"].rsplit("/", 1)[-1],
            "agency_label": binding["qidLabel"]["value"],
            "domain": binding.get("website", {}).get("value", None),
        }
        for binding in bindings
    ]
    return pd.DataFrame(records)

# Query Wikidata once, then derive a host name from each agency's website URL.
# Agencies without a website value get None in "domain_clean".
df_agencies = fetch_swedish_agencies()
df_agencies["domain_clean"] = df_agencies["domain"].apply(normalize_domain)



[DEBUG] Reading language codes from: sources/wiki_versions_all.csv
[DEBUG] Found 323 language codes.


In [26]:
# Display the agency table (pandas abbreviates the middle rows in the rendered output).
df_agencies

Unnamed: 0,qid,agency_label,domain,domain_clean
0,Q104870569,Alkoholinspektionen,,
1,Q10405389,Alkoholsortimentsnämnden,https://www.kammarkollegiet.se/om-oss/organisa...,kammarkollegiet.se
2,Q127946640,Allmänna advokatbyråerna,,
3,Q6971047,Allmänna reklamationsnämnden,http://www.arn.se/,arn.se
4,Q10413037,Ansvarsnämnden för djurens hälso- och sjukvård,https://www.vetansvar.se,vetansvar.se
...,...,...,...,...
402,Q10728611,Överklagandenämnden för högskolan,https://www.onh.se,onh.se
403,Q80207153,Överklagandenämnden för nämndemannauppdrag,https://www.domstol.se/overklagandenamnden-for...,domstol.se
404,Q10728612,Överklagandenämnden för studiestöd,https://www.oks.se,oks.se
405,Q30696695,Överstyrelsen för yrkesutbildning,,


In [36]:
# Shared HTTP session: every request made through it carries the same
# identifying User-Agent header.
session = requests.Session()
session.headers.update({
    "User-Agent": "SAT-link-checker/1.0 (https://github.com/salgo60; contact: salgo60@msn.com)"
})


In [None]:
import requests
import time
import pandas as pd
from urllib.parse import urlparse
from SPARQLWrapper import SPARQLWrapper, JSON

# Shared HTTP session used by is_valid_wiki() and fetch_exturlusage(); every
# request made through it carries the same identifying User-Agent header.
# NOTE(review): the contact part here is just "salgo60", while an earlier cell
# uses a full e-mail address - confirm which one is intended.
session = requests.Session()
session.headers.update({
    "User-Agent": "SAT-link-checker/1.0 (https://github.com/salgo60; contact: salgo60)"
})

# -----------------------------------------------------------
# STEP 1 — Get all valid Wikipedia languages from Wikidata
# -----------------------------------------------------------

def get_wikipedia_languages():
    """Ask the Wikidata query service for the language codes of Wikipedia sites.

    Returns:
        list: The ``?code`` value of every result binding, in the order
        returned by the query (which sorts by ``?code``).
    """
    query = """
    SELECT ?code WHERE {
      ?wiki a wikibase:Site ;
            wikibase:wikiGroup "wikipedia" ;
            wikibase:language ?code .
    }
    ORDER BY ?code
    """
    endpoint = SPARQLWrapper("https://query.wikidata.org/sparql")
    endpoint.setQuery(query)
    endpoint.setReturnFormat(JSON)
    response = endpoint.query().convert()

    codes = []
    for binding in response["results"]["bindings"]:
        codes.append(binding["code"]["value"])
    return codes


# Rebind lang_list to the codes returned by Wikidata and preview the first 20.
lang_list = get_wikipedia_languages()
print("Wikipedia languages:", len(lang_list))
print(lang_list[:20], "...")

# -----------------------------------------------------------
# STEP 2 — Normalize domain
# -----------------------------------------------------------

def normalize_domain(url):
    """Reduce a URL to its host name, without a leading ``www.``.

    Args:
        url: URL string such as ``"https://www.scb.se/hitta-statistik"``.
            A value without a scheme (``"www.scb.se/x"``) is accepted too.

    Returns:
        str or None: The host part of ``url`` (e.g. ``"scb.se"``), or
        ``None`` when ``url`` is not a string or is blank. An empty string
        is returned when the URL has a scheme but no network location.
    """
    if not isinstance(url, str) or url.strip() == "":
        return None

    url = url.strip()
    p = urlparse(url)
    host = p.netloc

    # Without "//" urlparse puts everything into the path, so retry bare
    # "www.example.se/path" input as a network-path reference.
    if not host and not p.scheme:
        host = urlparse("//" + url).netloc

    # Remove the prefix only: replace("www.", "") would also damage hosts
    # that merely contain that text ("awww.example.se" -> "aexample.se").
    # netloc never contains "/", so no further splitting is needed.
    if host.startswith("www."):
        host = host[len("www."):]

    return host

# Recompute the cleaned host column with the normalize_domain defined just above.
df_agencies["domain_clean"] = df_agencies["domain"].apply(normalize_domain)

# -----------------------------------------------------------
# STEP 3 — Test if a Wikipedia language works (API endpoint)
# -----------------------------------------------------------

def is_valid_wiki(lang):
    """Report whether ``https://{lang}.wikipedia.org`` answers an API query.

    Sends a ``meta=siteinfo`` request through the shared ``session`` with a
    5 second timeout.

    Args:
        lang: Language code used as the Wikipedia subdomain.

    Returns:
        bool: ``True`` when the response has status 200 and its JSON body
        contains a ``"query"`` key; ``False`` otherwise, including on any
        request or parsing error.
    """
    url = f"https://{lang}.wikipedia.org/w/api.php?action=query&meta=siteinfo&format=json"
    try:
        r = session.get(url, timeout=5)
        if r.status_code != 200:
            return False
        data = r.json()
        return "query" in data
    except Exception:
        # Best-effort probe: any request/JSON failure means "not usable".
        # Catching Exception rather than using a bare except lets
        # KeyboardInterrupt and SystemExit through, so the long probing loop
        # can still be interrupted.
        return False

# Probe every language code (one HTTP request each) and keep those for which is_valid_wiki() returned True.
valid_langs = [lang for lang in lang_list if is_valid_wiki(lang)]

print("Valid Wikipedia languages:", len(valid_langs))
print(valid_langs[:25], "...")

# -----------------------------------------------------------
# STEP 4 — Fetch exturlusage entries for one lang/domain
# -----------------------------------------------------------

def fetch_exturlusage(lang, domain):
    """Yield the pages of one Wikipedia edition that link to ``domain``.

    Pages through ``list=exturlusage`` on ``https://{lang}.wikipedia.org``
    via the shared ``session``, following the API's ``continue`` values
    until none is returned and pausing 0.3 s between pages. Request
    failures, non-200 responses, non-JSON bodies and API error payloads are
    reported with ``print`` and end the generator.

    Args:
        lang: Language code used as the Wikipedia subdomain.
        domain: Value sent as ``euquery``.

    Yields:
        dict: ``lang``, ``page_title``, ``url`` and ``wiki_link`` for each
        hit. ``wiki_link`` is ``None`` when the hit carries no title.
    """
    base = f"https://{lang}.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "list": "exturlusage",
        "euquery": domain,
        "eulimit": "max"
    }

    while True:
        try:
            r = session.get(base, params=params, timeout=10)
        except Exception as e:
            print(f"[ERROR] Request failed for {lang}: {e}")
            return

        if r.status_code == 403:
            print(f"[403] Forbidden for {lang} / {domain}")
            return

        # Other failures (e.g. 429, 5xx) must not pass silently as "no hits".
        if r.status_code != 200:
            print(f"[WARN] HTTP {r.status_code} for {lang} / {domain}")
            return

        try:
            data = r.json()
        except ValueError:
            print(f"[WARN] Non-JSON response for {lang} / {domain}")
            return

        # An "error" payload has no "query" key, so without this check it
        # would be indistinguishable from an empty result.
        if "error" in data:
            print(f"[WARN] API error for {lang} / {domain}: {data['error']}")
            return

        for item in data.get("query", {}).get("exturlusage", []):
            title = item.get("title")
            # Guard against a missing title instead of failing on None.replace().
            wiki_link = (
                f"https://{lang}.wikipedia.org/wiki/{title.replace(' ', '_')}"
                if title else None
            )
            yield {
                "lang": lang,
                "page_title": title,
                "url": item.get("url"),
                "wiki_link": wiki_link
            }

        if "continue" not in data:
            break

        # Merge the continuation values into the next request.
        params.update(data["continue"])
        time.sleep(0.3)

# -----------------------------------------------------------
# STEP 5 — Loop agencies + languages
# -----------------------------------------------------------

def get_all_outlinks(df_agencies, lang_list):
    """Collect the external-link hits for every agency host in every language.

    Args:
        df_agencies: DataFrame with the columns ``qid``, ``agency_label``
            and ``domain_clean``. Rows whose ``domain_clean`` is not a
            non-blank string are skipped.
        lang_list: Language codes to pass to ``fetch_exturlusage``.

    Returns:
        pandas.DataFrame: One row per hit with the columns ``qid``,
        ``agency_label``, ``domain``, ``lang``, ``page_title``,
        ``wiki_link`` and ``url``. Empty (no columns) when nothing is found.
    """
    rows = []
    hit_fields = ("lang", "page_title", "wiki_link", "url")

    for _, row in df_agencies.iterrows():
        qid, label, domain_clean = (
            row[column] for column in ("qid", "agency_label", "domain_clean")
        )

        has_domain = isinstance(domain_clean, str) and domain_clean.strip() != ""
        if not has_domain:
            continue

        print(f"\nChecking {qid} / {label} / {domain_clean}")
        agency_part = {"qid": qid, "agency_label": label, "domain": domain_clean}

        for lang in lang_list:
            # A failure for one language is reported and must not stop the
            # others; hits collected before the failure are kept.
            try:
                for hit in fetch_exturlusage(lang, domain_clean):
                    rows.append({**agency_part, **{field: hit[field] for field in hit_fields}})
            except Exception as e:
                print(f"[ERROR] {domain_clean} @ {lang}: {e}")

    return pd.DataFrame(rows)

# -----------------------------------------------------------
# STEP 6 — RUN
# -----------------------------------------------------------

# Run the full crawl: every agency host against every validated language edition.
# NOTE(review): fetch_exturlusage is called once per (host, language) pair, so expect a long run.
df_outlinks = get_all_outlinks(df_agencies, valid_langs)
df_outlinks.head()
