ESV.com omdireigeras till Statskontoret.se 

* [#51](https://github.com/salgo60/SCB-Wikidata/issues/51)

In [1]:
import time

from datetime import datetime

now = datetime.now()
timestamp = now.timestamp()

start_time = time.time()
print("Start:", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

Start: 2026-01-09 13:29:21


In [2]:

SCRIPT_NAME = "ESV_51.ipynb.ipynb"
SCRIPT_URL = (
    "https://github.com/salgo60/SCB-Wikidata/"
    "blob/master/notebook/ESV_51.ipynb.ipynb"
) 


In [3]:
def read_domains(file_path):
    print(f"[DEBUG] Reading domains from: {file_path}")
    df = pd.read_csv(file_path, header=0)   # <- skip header row
    domains_list = df.iloc[:, 0].dropna().unique().tolist()
    print(f"[DEBUG] Found {len(domains_list)} domains.")
    return domains_list


In [4]:
import requests

def fetch_sitematrix_df():
    url = "https://meta.wikimedia.org/w/api.php"
    params = {
        "action": "sitematrix",
        "format": "json"
    }
    headers = {
        "User-Agent": "salgo60-language-fetcher/1.0 (salgo60@msn.com)"
    }

    print("[DEBUG] Fetching sitematrix‚Ä¶")
    r = requests.get(url, params=params, headers=headers)
    r.raise_for_status()

    if "application/json" not in r.headers.get("Content-Type", ""):
        raise ValueError("Server returned non-JSON response")

    data = r.json()["sitematrix"]

    rows = []

    # --- language-specific sites ---
    for key, lang_block in data.items():
        if not key.isdigit():
            continue  # skip "count", "specials"

        lang_code = lang_block.get("code")
        lang_name = lang_block.get("name")

        for site in lang_block.get("site", []):
            rows.append({
                "lang_code": lang_code,
                "lang_name": lang_name,
                "project": site.get("project"),
                "url": site.get("url"),
                "dbname": site.get("dbname"),
                "site_name": site.get("sitename"),
                "closed": site.get("closed", False)
            })

    # --- special wikis (Wikidata, Commons, Meta, etc.) ---
    for site in data.get("specials", []):
        rows.append({
            "lang_code": "special",
            "lang_name": "special",
            "project": site.get("project"),
            "url": site.get("url"),
            "dbname": site.get("dbname"),
            "site_name": site.get("sitename"),
            "closed": site.get("closed", False)
        })

    return pd.DataFrame(rows)


In [5]:
import requests
import pandas as pd


HEADERS = {
    "User-Agent": "salgo60-language-fetcher/2.0 (https://github.com/salgo60)"
}


df_lang_fetch = fetch_sitematrix_df()
df_lang_fetch["closed"] = df_lang_fetch["closed"].fillna(False).astype(bool)

df_lang_wikipedia = df_lang_filtered = df_lang_fetch[
    (df_lang_fetch["site_name"] == "Wikipedia") &
    (df_lang_fetch["lang_name"].str.lower() != "special")]

#df_lang_wikipedia.to_csv("test.csv")
df_lang_wikipedia.info()

[DEBUG] Fetching sitematrix‚Ä¶
<class 'pandas.core.frame.DataFrame'>
Index: 184 entries, 0 to 937
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   lang_code  184 non-null    object
 1   lang_name  183 non-null    object
 2   project    0 non-null      object
 3   url        184 non-null    object
 4   dbname     184 non-null    object
 5   site_name  184 non-null    object
 6   closed     184 non-null    bool  
dtypes: bool(1), object(6)
memory usage: 10.2+ KB


In [6]:
import os

# Get the current working directory
current_directory = os.getcwd()
print("Current Working Directory:", current_directory)



Current Working Directory: /Users/salgo/Documents/GitHub/SCB-Wikidata/notebook


In [7]:
import os
import time
import random
import requests
import pandas as pd
from urllib.parse import urlparse
from tqdm.notebook import tqdm
file_path_domain = "sources/domains_ESV.csv"
domains = read_domains(file_path_domain)
print(domains)


[DEBUG] Reading domains from: sources/domains_ESV.csv
[DEBUG] Found 1 domains.
['esv.se']


In [9]:
# -----------------------------------------------------------
# Fetch exturlusage entries for one lang/domain
# -----------------------------------------------------------
def fetch_exturlusage(lang, domain):
    base = f"https://{lang}.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "list": "exturlusage",
        "euquery": domain,
        "eulimit": "max"
    }
    while True:
        r = session.get(base, params=params, timeout=10)
        try:
            data = r.json()
        except ValueError:
            print(f"[WARN] {lang}: JSON decode failed")
            break

        for item in data.get("query", {}).get("exturlusage", []):
            yield {
                "lang": lang,
                "page_title": item.get("title"),
                "url": item.get("url"),
                "wiki_link": f"https://{lang}.wikipedia.org/wiki/{item.get('title').replace(' ', '_')}"
            }

        if "continue" not in data:
            break
        params.update(data["continue"])
        time.sleep(0.3)

In [10]:

# -------------------------
# Session & helpers
# -------------------------
session = requests.Session()
session.headers.update({"User-Agent": "SCB-LinkAudit/1.0 (https://www.scb.se/)"})

# we need some filtering  

print("Antal Spr√•k:",len(df_lang_wikipedia ))
results = []
for _, row in df_lang_wikipedia.iterrows():
    lang = row["lang_code"]
    url  = row["url"]
    lang_name = row["lang_name"]
    before = len(results)
    #print(lang, url, lang_name,domains)
    for entry in fetch_exturlusage(lang, domains):
        results.append(entry)     
    after = len(results) 
    links = after-before
    print(lang, url, lang_name," - ", links)
    


Antal Spr√•k: 184
aa https://aa.wikipedia.org Qaf√°r af  -  0
ace https://ace.wikipedia.org Ac√®h  -  0
af https://af.wikipedia.org Afrikaans  -  0
ak https://ak.wikipedia.org None  -  0
ami https://ami.wikipedia.org Pangcah  -  0
an https://an.wikipedia.org aragon√©s  -  0
ast https://ast.wikipedia.org asturianu  -  0
av https://av.wikipedia.org –∞–≤–∞—Ä  -  0
avk https://avk.wikipedia.org Kotava  -  0
ay https://ay.wikipedia.org Aymar aru  -  0
bar https://bar.wikipedia.org Boarisch  -  0
bbc https://bbc.wikipedia.org Batak Toba  -  0
bcl https://bcl.wikipedia.org Bikol Central  -  0
bi https://bi.wikipedia.org Bislama  -  0
bm https://bm.wikipedia.org bamanankan  -  0
bo https://bo.wikipedia.org ‡Ωñ‡Ωº‡Ωë‡ºã‡Ω°‡Ω≤‡ΩÇ  -  0
br https://br.wikipedia.org brezhoneg  -  0
bs https://bs.wikipedia.org bosanski  -  0
btm https://btm.wikipedia.org Batak Mandailing  -  0
bug https://bug.wikipedia.org Basa Ugi  -  0
bxr https://bxr.wikipedia.org –±—É—Ä—è–∞–¥  -  0
cbk-zam https://cbk-zam.wikipe

In [11]:
df_esv = pd.DataFrame(results)
df_esv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154 entries, 0 to 153
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   lang        154 non-null    object
 1   page_title  154 non-null    object
 2   url         154 non-null    object
 3   wiki_link   154 non-null    object
dtypes: object(4)
memory usage: 4.9+ KB


In [12]:
import pandas as pd

# --- Stats ---
total_links = len(df_esv)
total_unique_links = df_esv['url'].nunique()
num_languages = df_esv['lang'].nunique()
langs_sorted = df_esv['lang'].value_counts()

print("Total links:", total_links)
print("Total unique links:", total_unique_links)
print("Number of languages:", num_languages)
print("\nLanguages with most links:")
print(langs_sorted.to_string())


Total links: 154
Total unique links: 111
Number of languages: 10

Languages with most links:
lang
sv    110
en     32
no      4
sq      2
da      1
de      1
fi      1
it      1
ms      1
zh      1


In [13]:
# --- Stats ---
total_links = len(df_esv)
total_unique_links = df_esv["url"].nunique()
langs_with_hits = sorted(df_esv["lang"].unique())

num_languages_found = len(langs_with_hits)
num_languages_checked = len(df_lang_wikipedia)        # alla spr√•k som genoms√∂ktes
num_languages_found = df_esv['lang'].nunique()


In [14]:
num_languages_checked 

184

In [15]:
import requests
from requests.exceptions import RequestException

SOFT_404_PHRASES = [
    "Sidan kan inte hittas",
    "sidan tagits bort",
    "felaktig adress",
    "kontakta registrator",
    "Regeringskansliets arkiv",
]

ROOT_CANONICAL = "https://www.esv.se"


def norm(u):
    return u.rstrip("/").lower()


def check_url(url, timeout=15):
    try:
        r = requests.get(
            url,
            allow_redirects=True,
            timeout=timeout,
            headers={
                "User-Agent": "LinkChecker/1.0 (research; salgo60@msn.com)"
            }
        )
    except RequestException as e:
        return {
            "url": url,
            "status": "error",
            "reason": str(e),
        }

    final_url = r.url
    status_code = r.status_code
    text = r.text or ""

    # 1. Hard HTTP error
    if status_code >= 400:
        return {
            "url": url,
            "status": "dead",
            "reason": f"HTTP {status_code}",
            "final_url": final_url,
        }

    # 2. Redirected to site root (content missing)
    if norm(final_url) == norm(ROOT_CANONICAL) and norm(url) != norm(final_url):
        return {
            "url": url,
            "status": "dead",
            "reason": "redirect_to_root",
            "final_url": final_url,
        }

    # 3. Soft 404 detection
    lowered = text.lower()
    for phrase in SOFT_404_PHRASES:
        if phrase.lower() in lowered:
            return {
                "url": url,
                "status": "dead",
                "reason": "soft_404",
                "final_url": final_url,
            }

    # 4. Otherwise OK
    return {
        "url": url,
        "status": "ok",
        "final_url": final_url,
    }


In [16]:
def check_internet_archive(url, timeout=10):
    api = "https://archive.org/wayback/available"
    try:
        r = requests.get(
            api,
            params={"url": url},
            timeout=timeout,
            headers={"User-Agent": "LinkChecker/1.0"}
        )
        data = r.json()
    except Exception:
        return None

    snap = data.get("archived_snapshots", {}).get("closest")
    if snap and snap.get("available"):
        return snap.get("url")

    return None


In [17]:
df_esv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154 entries, 0 to 153
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   lang        154 non-null    object
 1   page_title  154 non-null    object
 2   url         154 non-null    object
 3   wiki_link   154 non-null    object
dtypes: object(4)
memory usage: 4.9+ KB


In [19]:
from tqdm import tqdm  
import time
results = []
checked = set() 
urls = df_esv["url"].dropna().astype(str).unique()
for url in tqdm(
    urls,
    total=len(urls),
    desc="Checking esv.se links",
    unit="link",
):
    #print("url:", url)
    if url in checked:
        continue
    result = check_url(url)  
    ia_url = check_internet_archive(url)
    result["ia_url"] = ia_url
    result["ia_status"] = "available" if ia_url else "missing"
    results.append(result)
    checked.add(url)
    time.sleep(0.3)  


Checking esv.se links: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 111/111 [04:18<00:00,  2.33s/link]


In [20]:
# HARD RESET
df_results = None

print("len(results):", len(results))

df_results = pd.DataFrame(results)

print("df_results.shape:", df_results.shape)
print(df_results.head(2))
print(df_results.tail(2))


len(results): 111
df_results.shape: (111, 6)
                                                 url status  \
0  https://www.esv.se/statsliggaren/regleringsbre...     ok   
1  https://www.esv.se/statsliggaren/regleringsbre...     ok   

                                           final_url  \
0  https://www.statskontoret.se:443/statsliggaren...   
1  https://www.statskontoret.se:443/statsliggaren...   

                                              ia_url  ia_status reason  
0  http://web.archive.org/web/20250126205729/http...  available    NaN  
1                                               None    missing    NaN  
                                                   url status  \
109  https://www.esv.se/statsliggaren/regleringsbre...     ok   
110  https://www.esv.se/statsliggaren/regleringsbre...     ok   

                                             final_url  \
109  https://www.statskontoret.se:443/statsliggaren...   
110  https://www.statskontoret.se:443/statsliggaren...   

      

In [21]:
print("len(results):", len(results))
print("first:", results[0])
print("last:", results[-1])

len(results): 111
first: {'url': 'https://www.esv.se/statsliggaren/regleringsbrev/?RBID=13316', 'status': 'ok', 'final_url': 'https://www.statskontoret.se:443/statsliggaren/regleringsbrev/?RBID=13316', 'ia_url': 'http://web.archive.org/web/20250126205729/https://www.esv.se/statsliggaren/regleringsbrev/?RBID=13316', 'ia_status': 'available'}
last: {'url': 'https://www.esv.se/statsliggaren/regleringsbrev/?RBID=17189', 'status': 'ok', 'final_url': 'https://www.statskontoret.se:443/statsliggaren/regleringsbrev/?RBID=17189', 'ia_url': 'http://web.archive.org/web/20240901082304/https://www.esv.se/statsliggaren/regleringsbrev/?rbid=17189', 'ia_status': 'available'}


In [22]:
df_results["status"].value_counts()

status
dead    64
ok      47
Name: count, dtype: int64

In [23]:
(df_results["status"] == "dead").mean()

0.5765765765765766

In [24]:
df_results[
    (df_results["status"] == "dead") &
    (df_results["ia_status"] != "available")
].shape[0]

13

In [25]:
results[:3]

[{'url': 'https://www.esv.se/statsliggaren/regleringsbrev/?RBID=13316',
  'status': 'ok',
  'final_url': 'https://www.statskontoret.se:443/statsliggaren/regleringsbrev/?RBID=13316',
  'ia_url': 'http://web.archive.org/web/20250126205729/https://www.esv.se/statsliggaren/regleringsbrev/?RBID=13316',
  'ia_status': 'available'},
 {'url': 'https://www.esv.se/statsliggaren/regleringsbrev/Index?rbId=24927',
  'status': 'ok',
  'final_url': 'https://www.statskontoret.se:443/statsliggaren/regleringsbrev/Index?rbId=24927',
  'ia_url': None,
  'ia_status': 'missing'},
 {'url': 'http://www.esv.se/aboutnes.4.349a0e10b3c2567f280002057.html',
  'status': 'dead',
  'reason': 'HTTP 404',
  'final_url': 'https://www.statskontoret.se:443/aboutnes.4.349a0e10b3c2567f280002057.html',
  'ia_url': 'http://web.archive.org/web/20120212191553/http://www.esv.se/aboutnes.4.349a0e10b3c2567f280002057.html',
  'ia_status': 'available'}]

In [26]:
#df_results = pd.DataFrame(results)  
df_results = (
    pd.DataFrame(results)
    .drop_duplicates(subset="url", keep="last")
)
df_esv = df_esv.merge(
    df_results,
    on="url",
    how="left",
) 

In [27]:
len(results)

111

In [28]:
status_counts = df_esv["status"].value_counts()

num_ok = status_counts.get("ok", 0)
num_dead = status_counts.get("dead", 0)
num_error = status_counts.get("error", 0)
num_total = len(status_counts)
print( "Ok ",num_ok) 
print( "Dead ",num_dead)
print( "Error ",num_error ) 
print( "Total ",num_total )

Ok  80
Dead  74
Error  0
Total  2


In [29]:
df_esv["reason"].value_counts()

reason
HTTP 404    73
HTTP 401     1
Name: count, dtype: int64

In [30]:
from datetime import date
import os

# S√§tt datum
today = date.today().strftime("%Y_%m_%d")

# Se till att katalogen finns
os.makedirs("resultsESV", exist_ok=True)

# Bygg filnamn
outfile = f"resultsESV/links_ESV_v1_{today}.csv"

# Exportera
df_esv.to_csv(outfile, index=False, encoding="utf-8")

print(f"[OK] Exported {len(df_esv)} rows to {outfile}")


[OK] Exported 154 rows to resultsESV/links_ESV_v1_2026_01_09.csv


In [31]:
lang_stats = (
    df_esv
    .groupby("lang")
    .agg(
        total_links=("url", "count"),
        broken_links=("status", lambda s: (s == "dead").sum()),
        archived_links=("ia_status", lambda s: (s == "available").sum()),
    )
    .reset_index()
)

lang_stats["broken_pct"] = (
    100 * lang_stats["broken_links"] / lang_stats["total_links"]
).round(1)

lang_stats["broken_lost"] = (
    lang_stats["broken_links"] - lang_stats["archived_links"])

top10_langs = (
    lang_stats
    .sort_values("total_links", ascending=False)
    .head(10)
)
top10_langs[
    [
        "lang",
        "total_links",
        "broken_links",
        "broken_pct",
        "archived_links",
        "broken_lost",
    ]
]

Unnamed: 0,lang,total_links,broken_links,broken_pct,archived_links,broken_lost
8,sv,110,43,39.1,91,-48
2,en,32,23,71.9,26,-3
6,no,4,3,75.0,3,0
7,sq,2,2,100.0,2,0
0,da,1,0,0.0,1,-1
1,de,1,0,0.0,0,0
3,fi,1,1,100.0,1,0
4,it,1,1,100.0,0,1
5,ms,1,1,100.0,1,0
9,zh,1,0,0.0,0,0


In [32]:
assert (lang_stats["broken_links"] <= lang_stats["total_links"]).all()
assert (lang_stats["broken_pct"] <= 100).all()


In [34]:
from urllib.parse import urlparse

df = df_esv.copy()

df["domain"] = df["url"].apply(
    lambda u: urlparse(u).netloc.lower() if pd.notna(u) else None
)
domain_stats = (
    df
    .groupby("domain")
    .agg(
        total_links=("url", "count"),
        broken_links=("status", lambda s: (s == "dead").sum()),
        error_links=("status", lambda s: (s == "error").sum()),
    )
    .reset_index()
)
domain_stats["broken_pct"] = (
    100 * domain_stats["broken_links"] / domain_stats["total_links"]
).round(1)

domain_stats["error_pct"] = (
    100 * domain_stats["error_links"] / domain_stats["total_links"]
).round(1)


In [35]:
domain_stats 

Unnamed: 0,domain,total_links,broken_links,error_links,broken_pct,error_pct
0,datalabb.esv.se,1,1,0,100.0,0.0
1,esv.se,6,6,0,100.0,0.0
2,www.esv.se,147,67,0,45.6,0.0


In [36]:
status_counts = df["status"].value_counts()

num_ok = int(status_counts.get("ok", 0))
num_dead = int(status_counts.get("dead", 0))
num_error = int(status_counts.get("error", 0))
num_total = len(df)

pct_ok = round(100 * num_ok / num_total, 1)
pct_dead = round(100 * num_dead / num_total, 1)
pct_error = round(100 * num_error / num_total, 1)

# Broken links: archived vs lost
num_dead_archived = df[
    (df["status"] == "dead") & (df["ia_status"] == "available")
].shape[0]

num_dead_lost = num_dead - num_dead_archived


In [37]:
top_domains = (
    domain_stats[domain_stats["total_links"] >= 5]
    .sort_values("total_links", ascending=False)
    .head(10)
)
domain_stats_html = "<ul>"
for _, r in top_domains.iterrows():
    domain_stats_html += (
        f"<li><strong>{r['domain']}</strong>: "
        f"{r['broken_links']} / {r['total_links']} broken "
        f"({r['broken_pct']}%)</li>"
    )
domain_stats_html += "</ul>"


In [38]:
assert num_ok + num_dead + num_error  == num_total


In [42]:
from pathlib import Path
from datetime import date, datetime
from urllib.parse import quote
import pandas as pd


def save_sortable_html_df_regeringen(
    df,
    out_dir="resultsESV",
    domains=None,
    issue_url="https://github.com/salgo60/SCB-Wikidata/issues/51",
):
    out_dir = Path(out_dir)
    out_dir.mkdir(exist_ok=True)

    today = date.today().strftime("%Y_%m_%d")
    out_path = out_dir / f"links_esv_v1_{today}.html"
    rerun_ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

     # --- F√∂rbered data ---
    df = df.copy()  
    status_counts = df["status"].value_counts()

    num_ok = int(status_counts.get("ok", 0))
    num_dead = int(status_counts.get("dead", 0))
    num_error = int(status_counts.get("error", 0))
    num_total = len(df)
    
    pct_ok = round(100 * num_ok / num_total, 1)
    pct_dead = round(100 * num_dead / num_total, 1)
    pct_error = round(100 * num_error / num_total, 1)
    
    # Broken links: archived vs lost
    num_dead_archived = df[
        (df["status"] == "dead") & (df["ia_status"] == "available")
    ].shape[0]
    
    num_dead_lost = num_dead - num_dead_archived

    domains = domains or []
    from urllib.parse import urlparse
    
    df["domain"] = df["url"].apply(
        lambda u: urlparse(u).netloc.lower() if pd.notna(u) else None
    )

    domain_stats = (
        df
        .groupby("domain")
        .agg(
            total_links=("url", "count"),
            broken_links=("status", lambda s: (s == "dead").sum()),
            error_links=("status", lambda s: (s == "error").sum()),
        )
        .reset_index()
    )
    
    domain_stats["broken_pct"] = (
        100 * domain_stats["broken_links"] / domain_stats["total_links"]
    ).round(1)
    
    domain_stats["error_pct"] = (
        100 * domain_stats["error_links"] / domain_stats["total_links"]
    ).round(1)
    
    domain_stats["problem_pct"] = (
        100 * (domain_stats["broken_links"] + domain_stats["error_links"])
        / domain_stats["total_links"]
    ).round(1)
    

    domain_stats_html = "<ul>"
    for _, r in top_domains.iterrows():
        domain_stats_html += (
            f"<li><strong>{r['domain']}</strong>: "
            f"{r['broken_links']} / {r['total_links']} broken "
            f"({r['broken_pct']}%)</li>"
        )
    domain_stats_html += "</ul>"

    domain_table_html = (
        domain_stats
        .head(20)
        .to_html(
            classes="pivot",
            border=0,
            index=False,
        )
    )
    lang_stats = (
    df
        .groupby("lang")
        .agg(
            total_links=("url", "count"),
            broken_links=("status", lambda s: (s == "dead").sum()),
            error_links=("status", lambda s: (s == "error").sum()),
            broken_archived=("ia_status", lambda s: (s == "available").sum()),
        )
        .reset_index()
    )
    
    lang_stats["broken_lost"] = (
        lang_stats["broken_links"] - lang_stats["broken_archived"]
    )
    
    lang_stats["broken_pct"] = (
        100 * lang_stats["broken_links"] / lang_stats["total_links"]
    ).round(1)
    
    lang_stats["problem_pct"] = (
        100 * (lang_stats["broken_links"] + lang_stats["error_links"])
        / lang_stats["total_links"]
    ).round(1)

    lang_stats = lang_stats.sort_values(
        "broken_links",
        ascending=False
    )
 
    lang_stats_display = lang_stats[
        [
            "lang",
            "total_links",
            "broken_links",
            "broken_archived",
            "broken_lost",
            "broken_pct",
            "problem_pct",
        ]
    ]

    lang_table_html = (
        lang_stats_display
        .head(15)
        .to_html(
            classes="pivot",
            border=0,
            index=False,
        )
    )

    STATUS_ICON = {
        "ok":    ("fa-circle-check", "#2e7d32", "OK"),
        "dead":  ("fa-circle-xmark", "#c62828", "Broken link"),
        "error": ("fa-triangle-exclamation", "#ef6c00", "Request error"),
    }
    
    if "status" in df.columns:
        def render_status(r):
            icon, color, label = STATUS_ICON.get(
                r["status"], ("fa-question-circle", "#757575", "Unknown")
            )
            reason = r.get("reason", "")
            return (
                f'<span class="status-icon" '
                f'data-status="{r["status"]}" '
                f'title="{label}: {reason}" '
                f'style="color:{color}; font-size:14px; cursor:pointer;">'
                f'<i class="fa-solid {icon}"></i>'
                f'</span>'
            )
        df.insert(
            0,
            "status_icon",
            df.apply(render_status, axis=1)
        )

        def render_ia_icon(r):
            if r.get("ia_status") == "available" and r.get("ia_url"):
                return (
                    f'<a href="{r["ia_url"]}" target="_blank" '
                    f'title="Archived copy (Internet Archive)">'
                    f'<i class="fa-solid fa-box-archive" '
                    f'style="color:#1565c0;"></i>'
                    f'</a>'
                )
            return ""
        
        df.insert(
            1,
            "archive",
            df.apply(render_ia_icon, axis=1)
        )


    # Wikipedia: ikon + titel (byggd fr√•n lang + page_title)
    if {"lang", "page_title"}.issubset(df.columns):
        df["page_title"] = df.apply(
            lambda r: (
                f'<a href="https://{r["lang"]}.wikipedia.org/wiki/{quote(str(r["page_title"]))}" '
                f'target="_blank" title="Wikipedia ({r["lang"]})">'
                f'<i class="fa-brands fa-wikipedia-w" style="margin-right:6px;"></i>'
                f'{r["page_title"]}</a>'
                if pd.notna(r["lang"]) and pd.notna(r["page_title"])
                else r["page_title"]
            ),
            axis=1,
        )

    # Externa l√§nkar
    for col in ["Wikipedia-l√§nk", "Extern l√§nk", "url"]:
        if col in df.columns:
            df[col] = df[col].apply(
                lambda x: f'<a href="{x}" target="_blank">{x}</a>' if pd.notna(x) else ""
            )

    # --- HTML-tabell ---
    html_table = df.to_html(
        classes="pivot",
        border=0,
        escape=False,  # kr√§vs f√∂r HTML-l√§nkar
        index=False,
    )

    # --- CSS ---
    css = """
    <style>
      body {
        font-family: Arial, sans-serif;
        margin: 20px;
      }
      table.pivot {
        border-collapse: collapse;
        width: 100%;
        font-size: 12px;
      }
      table.pivot th, table.pivot td {
        border: 1px solid #999;
        padding: 6px 8px;
        text-align: left;
        vertical-align: top;
        white-space: normal;
      }
      table.pivot th {
        cursor: pointer;
        background: #f2f2f2;
      }
      table.pivot th:hover {
        background: #e2e2e2;
      }
      table.pivot thead th {
        position: sticky;
        top: 0;
        background: #f2f2f2;
        z-index: 2;
      }
      table.pivot th::after {
        content: "";
        float: right;
        opacity: 0.4;
      }
      table.pivot th.sorted-asc::after {
        content: " ‚ñ≤";
      }
      table.pivot th.sorted-desc::after {
        content: " ‚ñº";
      }
      /* Row coloring by status */
      table.pivot tr[data-status="dead"] {
         background-color: #fdecea;  /* light red */
      }
      table.pivot tr[data-status="dead"] td:nth-child(2) i {
          color: #1565c0;
        }

      table.pivot tr[data-status="error"] {
          background-color: #fff4e5;  /* light orange */
      }

      table.pivot td a {
        color: #0645ad;
        text-decoration: none;
      }
      table.pivot td a:hover {
        text-decoration: underline;
      }
      .meta {
        background: #f8f8f8;
        border: 1px solid #ccc;
        padding: 12px;
        margin-bottom: 20px;
        font-size: 13px;
      }
      .meta h2 {
        margin-top: 0;
      }
    </style>
    """

    # --- JavaScript (sortering) ---
    js = """
    <script>
    document.addEventListener('DOMContentLoaded', () => {
        // Propagate status from first cell to row
        document.querySelectorAll("table.pivot tbody tr").forEach(row => {
            const statusCell = row.querySelector(".status-icon");
            if (statusCell) {
                row.dataset.status = statusCell.dataset.status;
            }
        });
        let showOnlyBroken = false;

        document.querySelectorAll(".status-icon").forEach(icon => {
            icon.addEventListener("click", event => {
                event.stopPropagation(); // prevent column sort
                showOnlyBroken = !showOnlyBroken;
        
                document.querySelectorAll("table.pivot tbody tr").forEach(row => {
                    if (showOnlyBroken) {
                        row.style.display =
                            row.dataset.status === "dead" ? "" : "none";
                    } else {
                        row.style.display = "";
                    }
                });
            });
        });

        document.querySelectorAll("table.pivot th").forEach((header, colIndex) => {
            header.addEventListener("click", () => {
                const table = header.closest("table");
                const tbody = table.querySelector("tbody");
                const rows = Array.from(tbody.querySelectorAll("tr"));
                const asc = !header.classList.contains("sorted-asc");

                rows.sort((a, b) => {
                    const A = a.children[colIndex].innerText.trim();
                    const B = b.children[colIndex].innerText.trim();
                    const numA = parseFloat(A.replace(",", "."));
                    const numB = parseFloat(B.replace(",", "."));
                    if (!isNaN(numA) && !isNaN(numB)) {
                        return asc ? numA - numB : numB - numA;
                    }
                    return asc ? A.localeCompare(B) : B.localeCompare(A);
                });

                table.querySelectorAll("th").forEach(th =>
                    th.classList.remove("sorted-asc", "sorted-desc")
                );
                header.classList.add(asc ? "sorted-asc" : "sorted-desc");
                rows.forEach(row => tbody.appendChild(row));
            });
        });
    });
    </script>
    """
    status_counts = df["status"].value_counts()
    
    num_ok = status_counts.get("ok", 0)
    num_dead = status_counts.get("dead", 0)
    num_error = status_counts.get("error", 0)
    num_total = len(df)

    # --- Metadata ---
    meta_html = f"""
    <div class="meta">
      <h2>Summary</h2>
    
      <p><strong>Rerun:</strong> {rerun_ts}</p>
      <p><strong>Script:</strong>
         <a href="{SCRIPT_URL}" target="_blank">{SCRIPT_NAME}</a>
      </p>
    
      <p>
        <strong>Links checked:</strong> {num_total}<br>
        <strong style="color:#2e7d32;">OK:</strong> {num_ok} ({pct_ok}%)<br>
        <strong style="color:#c62828;">Broken:</strong> {num_dead} ({pct_dead}%)<br>
        &nbsp;&nbsp;‚Ü≥ Archived: {num_dead_archived}<br>
        &nbsp;&nbsp;‚Ü≥ Lost: {num_dead_lost}<br>
        <strong style="color:#ef6c00;">Errors:</strong> {num_error} ({pct_error}%)
      </p>
      <p><strong>Issue:</strong>
         <a href="{issue_url}" target="_blank">{issue_url.split("/")[-1]}</a>
      </p>
    
      <p><strong>Datak√§llor:</strong><br>
         Wikidata<br>
         MediaWiki API ‚Äì exturlusage
      </p>
    
      <h2>Domains with broken links</h2>
      <p>Top domains ranked by broken-link impact.</p>
      {domain_table_html}
    </div>
    """


    # --- Slutlig HTML ---
    html = f"""
    <html>
    <head>
      <meta charset="utf-8">
      <title>esv.se links in Wikipedia</title>
      <link rel="stylesheet"
            href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.1/css/all.min.css">
      {css}
    </head>
    <body>
      <h1>Wikipedia ‚Üí ESV v1</h1>
      {meta_html}
      <p>Sorterbar tabell. Klicka p√• kolumnrubriker f√∂r sortering.</p>
      {html_table}
      {js}
      <h2>Broken-link summary by Wikipedia language</h2>
     <p>
       Languages ranked by broken-link impact (broken + error links).
     </p>
    {lang_table_html}
    </body>
    </html>
    """

    out_path.write_text(html, encoding="utf-8")
    print(f"‚úÖ HTML skapad: {out_path}")


In [43]:

save_sortable_html_df_regeringen(df_esv)


‚úÖ HTML skapad: resultsESV/links_esv_v1_2026_01_09.html
