* [#4](https://github.com/salgo60/SCB-Wikidata/issues/4)
* denna [SCB4_linkroot.ipynb](https://github.com/salgo60/SCB-Wikidata/blob/main/notebook/SCB4_linkroot.ipynb)

In [1]:
import time

from datetime import datetime

now = datetime.now()
timestamp = now.timestamp()

start_time = time.time()
print("Start:", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

Start: 2025-11-17 01:39:41


In [2]:
def log(level, msg):
    ts = datetime.now().strftime("%H:%M:%S")
    print(f"[{ts}] [{level}] {msg}")


In [3]:
def read_language_codes(file_path): 
    print(f"[DEBUG] Reading language codes from: {file_path}") 
    df = pd.read_csv(file_path) 
    lang_codes = df['Language Code'].dropna().unique().tolist() 
    print(f"[DEBUG] Found {len(lang_codes)} language codes.") 
    return lang_codes 
    
def read_domains(file_path): 
    print(f"[DEBUG] Reading domains from: {file_path}") 
    df = pd.read_csv(file_path, header=None, names=['domain']) 
    domains_list = df['domain'].dropna().unique().tolist() 
    print(f"[DEBUG] Found {len(domains_list)} domains.") 
    return domains_list

In [4]:
import os
import time 
import pandas as pd
from urllib.parse import urlparse

CHECKPOINT_LINKS  = "results/_checkpoint_exturlusage.csv"
CHECKPOINT_STATUS = "results/_checkpoint_status.csv"
OUT_ALL           = "results/all_links.csv"
OUT_DEAD          = "results/dead_links.csv"

# -------------------------------------------------
# LADDA LÄNKAR
# -------------------------------------------------
df = pd.read_csv(CHECKPOINT_LINKS)

print("Alla rader i checkpoint:", len(df))

# Ta unika URL:er
urls = df["url"].dropna().unique().tolist()
total = len(urls)
print(f"Totalt unika URL: {total}")

# -------------------------------------------------
# LÄNKAR PER WIKIPEDIA-SPRÅK
# -------------------------------------------------

# Försök hitta vilken kolumn som innehåller språkkoden
possible_lang_cols = ["lang", "language", "wiki", "wiki_lang", "language_code"]
lang_col = None

for c in possible_lang_cols:
    if c in df.columns:
        lang_col = c
        break

if lang_col is None:
    print("⚠️ Hittar ingen språk-kolumn i datan! Inga språk-stats kan visas.")
else:
    print(f"\nSpråk-kolumn hittad: '{lang_col}'")

    # Totalt antal länkar per språk (inkl. dubbletter)
    print("\n--- Statistik per språk (alla rader) ---")
    lang_counts_all = df[lang_col].value_counts()
    for lang, cnt in lang_counts_all.items():
        print(f"{lang:5} {cnt}")

    # Antal UNIKA URLer per språk
    print("\n--- Statistik per språk (unika URL:er) ---")
    df_unique = df.drop_duplicates(subset=["url"])
    lang_counts_unique = df_unique[lang_col].value_counts()
    for lang, cnt in lang_counts_unique.items():
        print(f"{lang:5} {cnt}")

    print("\nAntal språk:", len(lang_counts_all))

# -------------------------------------------------
# DOMÄNSTATISTIK
# -------------------------------------------------
domains = [urlparse(u).netloc.lower() for u in urls]
domain_counts = pd.Series(domains).value_counts()

print("\n--- Statistik per domän (flest först) ---")
for dom, cnt in domain_counts.items():
    print(f"{dom:35} {cnt}")

print("\nAntal domäner:", len(domain_counts))

# -------------------------------------------------
# LADDA PREVIOUS STATUS (resume)
# -------------------------------------------------
status_map = {}
if os.path.exists(CHECKPOINT_STATUS):
    prev = pd.read_csv(CHECKPOINT_STATUS)
    status_map = dict(zip(prev["url"], prev["status"]))
    print("\nLaddade tidigare status:", len(status_map))

# Filtrera bort redan testade länkar
remaining = [u for u in urls if u not in status_map]

print("\n--- Sammanfattning ---")
print(f"Totalt unika URL:         {total}")
print(f"Redan testade:             {len(status_map)}")
print(f"Kvar att testa:            {len(remaining)}")
print(f"Antal domäner:             {len(domain_counts)}")

# Visa topp 10 domäner
print("\nTopp 10 domäner:")
print(domain_counts.head(10))



Alla rader i checkpoint: 110927
Totalt unika URL: 23871

Språk-kolumn hittad: 'lang'

--- Statistik per språk (alla rader) ---
sv    71851
fi    9316
no    6638
en    6408
pl    2668
de    2224
it    2122
ro    1905
da    1818
es    1453
hu    1193
pt    513
ru    500
tr    395
nn    362
nl    352
is    315
vi    226
hr    174
cs    140
ca    96
se    79
sl    56
et    41
lv    28
eo    19
sk    18
so    6
rm    4
cy    3
ga    2
ceb   2

--- Statistik per språk (unika URL:er) ---
sv    20931
da    1304
en    815
fi    341
de    103
es    75
ru    56
pt    44
no    36
pl    25
it    21
nl    17
hu    16
cs    13
ro    12
ca    11
tr    10
hr    7
nn    6
et    6
is    6
vi    5
lv    2
eo    2
rm    2
sk    2
ga    1
sl    1
so    1

Antal språk: 32

--- Statistik per domän (flest först) ---
geodata.scb.se                      18369
www.scb.se                          2931
kommunsiffror.scb.se                1444
www.statistikdatabasen.scb.se       472
share.scb.se                     

In [5]:
# -----------------------------------------------------------
# Fetch exturlusage entries for one lang/domain
# -----------------------------------------------------------
def fetch_exturlusage(lang, domain):
    base = f"https://{lang}.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "list": "exturlusage",
        "euquery": domain,
        "eulimit": "max"
    }
    while True:
        r = session.get(base, params=params, timeout=10)
        data = r.json()
        for item in data.get("query", {}).get("exturlusage", []):
            yield {
                "lang": lang,
                "page_title": item.get("title"),
                "url": item.get("url"),
                "wiki_link": f"https://{lang}.wikipedia.org/wiki/{item.get('title').replace(' ', '_')}"
            }

        if "continue" not in data:
            break
        params.update(data["continue"])
        time.sleep(0.3)
# -----------------------------------------------------------
# Modified main() with resume support
# -----------------------------------------------------------


In [None]:
# ============================================
#  SCB LINK CHECKER — Notebook Optimized
#  Stabil, resume, progressbar, adaptiv hastighet
# ============================================

import os
import time
import random
import requests
import pandas as pd
from urllib.parse import urlparse
from tqdm.notebook import tqdm

# -------------------------
# KONFIG
# -------------------------
CHECKPOINT_LINKS  = "results/_checkpoint_exturlusage.csv"
CHECKPOINT_STATUS = "results/_checkpoint_status.csv"
OUT_ALL           = "results/all_links.csv"
OUT_DEAD          = "results/dead_links.csv"

# Adaptiv request-hantering
REQUEST_TIMEOUT   = 1.2       # SCB klarar inte kortare timeout
PAUSE_MIN         = 0.10
PAUSE_MAX         = 1.50
ADAPT_UP          = 0.10
ADAPT_DOWN        = 0.04
ROLLING_N         = 150
SAVE_EVERY        = 200        # checkpoint oftare
ERR_RATE_THRESHOLD = 0.50
LATENCY_THRESHOLD  = 1.5
COOLDOWN_BASE      = 60
COOLDOWN_MAX       = 1800

session = requests.Session()
session.headers.update({
    "User-Agent": "SCB-LinkTest/1.0 (contact: salgo60@msn.com)"
})

SCB_SOFT_404 = [
    "sidan kunde inte hittas",
    "vi vill gärna hjälpa dig vidare",
    "snabba fakta om sverige",
    "ta dig vidare till",
]

def detect_scb_soft_404(url, response):
    text = response.text.lower()
    if any(sig in text for sig in SCB_SOFT_404):
        return "SOFT_404"
    if response.status_code == 404:
        return "HARD_404"
    return 200

def check_status(url):
    t0 = time.time()
    try:
        r = session.get(url, timeout=REQUEST_TIMEOUT, allow_redirects=True, stream=True)
        code = r.status_code

        if "scb.se" in url.lower():
            return detect_scb_soft_404(url, r), time.time() - t0

        return code, time.time() - t0

    except Exception:
        return "ERROR", time.time() - t0


# -------------------------
# LADDA LÄNKAR
# -------------------------
if not os.path.exists(CHECKPOINT_LINKS):
    raise FileNotFoundError("Hittar inte länkar i results/_checkpoint_exturlusage.csv – kör exturlusage först!")

df = pd.read_csv(CHECKPOINT_LINKS)
urls = df["url"].dropna().unique().tolist()
print("Totalt unika URL:", len(urls))

# -------------------------
# LADDA STATUS-CHECKPOINT
# -------------------------
status_map = {}
if os.path.exists(CHECKPOINT_STATUS):
    prev = pd.read_csv(CHECKPOINT_STATUS)
    status_map = dict(zip(prev["url"], prev["status"]))
    print("Laddade tidigare status:", len(status_map))

remaining = [u for u in urls if u not in status_map]
print("Återstår att testa:", len(remaining))

# -------------------------
# ADAPTIV LOOP
# -------------------------
pause = 0.30
recent = []
cooldown_factor = 0

pbar = tqdm(total=len(remaining), desc="Kollar SCB-länkar")

for url in remaining:

    status, latency = check_status(url)
    status_map[url] = status

    # Rolling window
    is_err = 1 if status not in (200,) else 0
    recent.append((latency, is_err))
    if len(recent) > ROLLING_N:
        recent = recent[-ROLLING_N:]

    latency_avg = sum(t for t,e in recent) / len(recent)
    err_rate = sum(e for _,e in recent) / len(recent)

    # Cooldown vid dålig period
    if latency_avg > LATENCY_THRESHOLD or err_rate > ERR_RATE_THRESHOLD:
        cooldown_factor = min(cooldown_factor + 1, 6)
        cooldown = min(COOLDOWN_BASE * (2 ** (cooldown_factor - 1)), COOLDOWN_MAX)

        print(f"[COOLDOWN] lat={latency_avg:.2f}s err={err_rate:.2%} → väntar {cooldown}s")
        pd.DataFrame([{"url":u,"status":s} for u,s in status_map.items()]).to_csv(CHECKPOINT_STATUS, index=False)

        time.sleep(cooldown)
        recent = []
        pause = PAUSE_MAX
    else:
        if cooldown_factor > 0:
            cooldown_factor = max(0, cooldown_factor - 1)

    # Adaptiv paus
    if latency_avg > 0.9:
        pause = min(PAUSE_MAX, pause + ADAPT_UP)
    elif latency_avg < 0.5:
        pause = max(PAUSE_MIN, pause - ADAPT_DOWN)

    jitter = random.uniform(-0.04, 0.12)
    actual_sleep = max(PAUSE_MIN, min(PAUSE_MAX, pause + jitter))

    pbar.set_postfix({
        "lat": f"{latency_avg:.2f}s",
        "pause": f"{actual_sleep:.2f}s",
        "err": f"{err_rate:.2%}",
        "status": status
    })

    time.sleep(actual_sleep)
    pbar.update(1)

    # Checkpoint
    if pbar.n % SAVE_EVERY == 0:
        pd.DataFrame([{"url":u,"status":s} for u,s in status_map.items()]).to_csv(CHECKPOINT_STATUS, index=False)

pbar.close()

# -------------------------
# SPARA RESULTAT
# -------------------------
pd.DataFrame([{"url":u,"status":s} for u,s in status_map.items()]).to_csv(CHECKPOINT_STATUS, index=False)

df["status"] = df["url"].map(status_map)
df.to_csv(OUT_ALL, index=False)

dead = df[df["status"].isin(["HARD_404", "SOFT_404", "ERROR"])]
dead.to_csv(OUT_DEAD, index=False)

print("KLART!")
print("All links:", OUT_ALL)
print("Dead links:", OUT_DEAD)


Totalt unika URL: 23871
Laddade tidigare status: 3166
Återstår att testa: 20705


Kollar SCB-länkar:   0%|          | 0/20705 [00:00<?, ?it/s]

[COOLDOWN] lat=0.05s err=100.00% → väntar 60s
[COOLDOWN] lat=0.11s err=100.00% → väntar 120s
[COOLDOWN] lat=0.02s err=100.00% → väntar 240s
[COOLDOWN] lat=0.04s err=100.00% → väntar 480s
[COOLDOWN] lat=0.03s err=100.00% → väntar 960s
[COOLDOWN] lat=0.02s err=100.00% → väntar 1800s
[COOLDOWN] lat=0.05s err=100.00% → väntar 1800s
[COOLDOWN] lat=0.27s err=100.00% → väntar 1800s
[COOLDOWN] lat=0.02s err=100.00% → väntar 1800s
[COOLDOWN] lat=0.12s err=100.00% → väntar 1800s
[COOLDOWN] lat=0.18s err=100.00% → väntar 1800s


In [None]:
# Heatmap + CSV/HTML export + bar chart (Notebook)
import os
import pandas as pd
from urllib.parse import urlparse
import numpy as np
import matplotlib.pyplot as plt

# Helper for interactive display (optional)
try:
    from ace_tools import display_dataframe_to_user
    have_display = True
except Exception:
    have_display = False

# Paths - ändra om din fil ligger annorlunda
CHECKPOINT_LINKS = "results/_checkpoint_exturlusage.csv"
OUT_CSV = "results/heatmap_counts2.csv"
OUT_HTML = "results/heatmap_counts2.html"
OUT_LOG = "results/domain_lang_summary.csv"

# Kontrollera fil
if not os.path.exists(CHECKPOINT_LINKS):
    raise FileNotFoundError(f"Input file not found: {CHECKPOINT_LINKS}. Flytta filen eller uppdatera CHECKPOINT_LINKS.")

# Läs in
df = pd.read_csv(CHECKPOINT_LINKS)

# Försök hitta språk-kolumn
possible_lang_cols = ["lang", "language", "wiki", "wiki_lang", "language_code"]
lang_col = next((c for c in possible_lang_cols if c in df.columns), None)
if lang_col is None:
    for c in df.columns:
        if c.lower() in possible_lang_cols:
            lang_col = c
            break
if lang_col is None:
    raise ValueError("Hittar ingen språk-kolumn i CSV. Förväntade någon av: " + ", ".join(possible_lang_cols))

# Extrahera domän
def extract_domain(u):
    try:
        return urlparse(u).netloc.lower()
    except Exception:
        return ""
df['domain'] = df['url'].fillna('').apply(extract_domain)
df[lang_col] = df[lang_col].astype(str).str.strip()

# Pivot: språk x domän
pivot = df.pivot_table(index=lang_col, columns='domain', values='url', aggfunc='count', fill_value=0)
lang_totals = pivot.sum(axis=1).sort_values(ascending=False)
pivot = pivot.reindex(lang_totals.index)

# Totals per domain
domain_totals = pivot.sum(axis=0).sort_values(ascending=False)

# Save outputs
pivot.to_csv(OUT_CSV)
pivot.to_html(OUT_HTML)
summary = pivot.reset_index().melt(id_vars=[lang_col], var_name='domain', value_name='count')
summary = summary.sort_values(['count'], ascending=False)
summary.to_csv(OUT_LOG, index=False)

# Present a preview
preview = pivot.copy()
if preview.shape[1] > 40:
    preview = preview.iloc[:, :40]
if have_display:
    display_dataframe_to_user("Language x Domain counts (preview)", preview.reset_index())
else:
    print("Preview (truncated to first 40 domains):")
    display(preview.reset_index().head(40))

# --- Heatmap (top domains) ---
TOP_DOMAINS = 30
top_domains = domain_totals.head(TOP_DOMAINS).index.tolist()
plot_pivot = pivot[top_domains]

plt.figure(figsize=(max(8, len(top_domains)*0.35), max(6, len(plot_pivot.index)*0.25)))
plt.imshow(plot_pivot.values, aspect='auto', interpolation='nearest')
plt.colorbar(label='Count')
plt.xticks(ticks=np.arange(len(top_domains)), labels=top_domains, rotation=90, fontsize=8)
plt.yticks(ticks=np.arange(len(plot_pivot.index)), labels=plot_pivot.index, fontsize=8)
plt.title('Heatmap: links per language × top domains')
plt.tight_layout()
plt.show()

# --- Bar chart of top domains overall ---
top20 = domain_totals.head(20)
plt.figure(figsize=(10,4))
plt.bar(range(len(top20)), top20.values)
plt.xticks(range(len(top20)), top20.index, rotation=90, fontsize=8)
plt.ylabel('Total links')
plt.title('Top 20 domains by total links')
plt.tight_layout()
plt.show()

# Filvägar
print("\nSaved files:")
print(f"- Full pivot CSV: {OUT_CSV}")
print(f"- Full pivot HTML: {OUT_HTML}")
print(f"- Long-form summary CSV: {OUT_LOG}")


In [None]:
import streamlit as st
import pandas as pd

st.title("SCB Link Audit — Wikipedia")

st.write("Visualisering av länkröta relaterad till scb.se på Wikipedia.")

df_all = pd.read_csv("results/all_links.csv")
df_dead = pd.read_csv("results/dead_links.csv")
df_art = pd.read_csv("results/stats_per_article.csv")
df_path = pd.read_csv("results/stats_per_domain_path.csv")

st.header("1. Översikt")
col1, col2, col3 = st.columns(3)

col1.metric("Totala länkar", len(df_all))
col2.metric("Döda länkar", len(df_dead))
col3.metric("Döda (%)", f"{100 * len(df_dead)/len(df_all):.2f}%")

st.header("2. Status-fördelning")
st.bar_chart(df_all["status"].value_counts())

st.header("3. Artiklar med flest trasiga länkar")
bad = df_art[df_art["dead_links"] > 0].sort_values("dead_links", ascending=False)
st.dataframe(bad)

st.header("4. SCB paths där flest länkar gått sönder")
st.dataframe(df_path.sort_values("dead_ratio", ascending=False))

st.header("5. Alla länkar")
st.dataframe(df_all)


In [None]:
 # End timer and calculate duration
end_time = time.time()
elapsed_time = end_time - start_time# Bygg audit-lager för den här etappen

# Print current date and total time
print("Date:", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
minutes, seconds = divmod(elapsed_time, 60)
print("Total time elapsed: {:02.0f} minutes {:05.2f} seconds".format(minutes, seconds))
