* [#42](https://github.com/salgo60/SamlaLibris/issues/42)
* denna Notebook [RAA42_linkroot.ipynb](https://github.com/salgo60/SamlaLibris/blob/master/notebook/RAA42_linkroot.ipynb)
   *  [links_raa_2025_11_30.html](https://salgo60.github.io/SamlaLibris/notebook/results/links_raa_2025_11_30.html) 

In [1]:
import time

from datetime import datetime

now = datetime.now()
timestamp = now.timestamp()

start_time = time.time()
print("Start:", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

Start: 2025-11-30 18:48:56


In [2]:
SCRIPT_NAME = "RAA42_linkroot.ipynb"
SCRIPT_URL = (
    "https://github.com/salgo60/SamlaLibris/"
    "blob/master/Notebook/RAA42_linkroot.ipynb"
)

In [3]:
import os

# Get the current working directory
current_directory = os.getcwd()
print("Current Working Directory:", current_directory)



Current Working Directory: /Users/salgo/Documents/GitHub/SamlaLibris/notebook


In [4]:
def read_domains(file_path):
    print(f"[DEBUG] Reading domains from: {file_path}")
    print(f"[DEBUG] Current Working Directory: {current_directory}")

    df = pd.read_csv(file_path, header=0)   # <- skip header row
    domains_list = df.iloc[:, 0].dropna().unique().tolist()
    print(f"[DEBUG] Found {len(domains_list)} domains.")
    return domains_list


In [5]:
import requests

def fetch_sitematrix_df():
    url = "https://meta.wikimedia.org/w/api.php"
    params = {
        "action": "sitematrix",
        "format": "json"
    }
    headers = {
        "User-Agent": "salgo60-language-fetcher/1.0 (salgo60@msn.com)"
    }

    print("[DEBUG] Fetching sitematrix‚Ä¶")
    r = requests.get(url, params=params, headers=headers)
    r.raise_for_status()

    if "application/json" not in r.headers.get("Content-Type", ""):
        raise ValueError("Server returned non-JSON response")

    data = r.json()["sitematrix"]

    rows = []

    # --- language-specific sites ---
    for key, lang_block in data.items():
        if not key.isdigit():
            continue  # skip "count", "specials"

        lang_code = lang_block.get("code")
        lang_name = lang_block.get("name")

        for site in lang_block.get("site", []):
            rows.append({
                "lang_code": lang_code,
                "lang_name": lang_name,
                "project": site.get("project"),
                "url": site.get("url"),
                "dbname": site.get("dbname"),
                "site_name": site.get("sitename"),
                "closed": site.get("closed", False)
            })

    # --- special wikis (Wikidata, Commons, Meta, etc.) ---
    for site in data.get("specials", []):
        rows.append({
            "lang_code": "special",
            "lang_name": "special",
            "project": site.get("project"),
            "url": site.get("url"),
            "dbname": site.get("dbname"),
            "site_name": site.get("sitename"),
            "closed": site.get("closed", False)
        })

    return pd.DataFrame(rows)


In [6]:
import requests
import pandas as pd


HEADERS = {
    "User-Agent": "salgo60-language-fetcher/2.0 (https://github.com/salgo60)"
}


df_wiki = fetch_sitematrix_df()
df_wiki["closed"] = df_wiki["closed"].fillna(False).astype(bool)
df_wiki = df_wiki[df_wiki["site_name"] == "Wikipedia"]
df_wiki.head()



[DEBUG] Fetching sitematrix‚Ä¶


Unnamed: 0,lang_code,lang_name,project,url,dbname,site_name,closed
0,aa,Qaf√°r af,,https://aa.wikipedia.org,aawiki,Wikipedia,False
5,ace,Ac√®h,,https://ace.wikipedia.org,acewiki,Wikipedia,False
7,af,Afrikaans,,https://af.wikipedia.org,afwiki,Wikipedia,False
11,ak,,,https://ak.wikipedia.org,akwiki,Wikipedia,False
18,ami,Pangcah,,https://ami.wikipedia.org,amiwiki,Wikipedia,False


In [7]:
df_wiki.info()

<class 'pandas.core.frame.DataFrame'>
Index: 207 entries, 0 to 1061
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   lang_code  207 non-null    object
 1   lang_name  206 non-null    object
 2   project    0 non-null      object
 3   url        207 non-null    object
 4   dbname     207 non-null    object
 5   site_name  207 non-null    object
 6   closed     207 non-null    bool  
dtypes: bool(1), object(6)
memory usage: 11.5+ KB


In [8]:
len(df_wiki)

207

In [9]:
# -----------------------------------------------------------
# Fetch exturlusage entries for one lang/domain
# -----------------------------------------------------------
def fetch_exturlusage(lang, domain):
    base = f"https://{lang}.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "list": "exturlusage",
        "euquery": domain,
        "eulimit": "max"
    }
    while True:
        r = session.get(base, params=params, timeout=10)
        try:
            data = r.json()
        except ValueError:
            print(f"[WARN] {lang}: JSON decode failed")
            break

        for item in data.get("query", {}).get("exturlusage", []):
            yield {
                "lang": lang,
                "page_title": item.get("title"),
                "url": item.get("url"),
                "wiki_link": f"https://{lang}.wikipedia.org/wiki/{item.get('title').replace(' ', '_')}"
            }

        if "continue" not in data:
            break
        params.update(data["continue"])
        time.sleep(0.3)

In [10]:
import os
import time
import random
import requests
import pandas as pd
from urllib.parse import urlparse
from tqdm.notebook import tqdm
file_path_domain = "sources/domains.csv"
domains = read_domains(file_path_domain)
print(domains)


[DEBUG] Reading domains from: sources/domains.csv
[DEBUG] Current Working Directory: /Users/salgo/Documents/GitHub/SamlaLibris/notebook
[DEBUG] Found 6 domains.
['raa.se', 'kringla.nu', 'fornsok.se', 'kulturnav.org', 'k-samsok.se', 'kulturmiljolagen.se']


In [20]:
# -------------------------
# Session & helpers
# -------------------------
session = requests.Session()
session.headers.update(
    {"User-Agent": "RAA-LinkAudit/1.0 salgo60@msn.com"}
)

# Filter out special languages
df_wiki_lang = df_wiki[df_wiki["lang_code"] != "special"]
print("Antal Spr√•k:", len(df_wiki_lang))

results = []

for _, row in df_wiki_lang.iterrows():
    lang = row["lang_code"]
    url = row["url"]
    lang_name = row["lang_name"]

    before = len(results)
    for d in domains:
        #print(lang, url, lang_name, d)
        before_domain = len(results)


        for entry in fetch_exturlusage(lang, d):
            entry["domain"] = d
            entry["lang_code"] = lang
            entry["lang_name"] = lang_name
            results.append(entry)
        added_domain = len(results) - before_domain
        if added_domain > 0:
            print("\t",d, "- nr added domain: ", added_domain)

    added = len(results) - before
    print(lang, lang_name, "- nr added lang: ", added)


Antal Spr√•k: 184
aa Qaf√°r af - nr added lang:  0
	 kulturnav.org - nr added domain:  3
ace Ac√®h - nr added lang:  3
	 raa.se - nr added domain:  1
	 kulturnav.org - nr added domain:  1117
af Afrikaans - nr added lang:  1118
ak None - nr added lang:  0
ami Pangcah - nr added lang:  0
	 kulturnav.org - nr added domain:  845
an aragon√©s - nr added lang:  845
	 raa.se - nr added domain:  2
	 kulturnav.org - nr added domain:  982
ast asturianu - nr added lang:  984
	 kulturnav.org - nr added domain:  11
av –∞–≤–∞—Ä - nr added lang:  11
avk Kotava - nr added lang:  0
ay Aymar aru - nr added lang:  0
	 kulturnav.org - nr added domain:  1
bar Boarisch - nr added lang:  1
bbc Batak Toba - nr added lang:  0
	 kulturnav.org - nr added domain:  22
bcl Bikol Central - nr added lang:  22
bi Bislama - nr added lang:  0
bm bamanankan - nr added lang:  0
bo ‡Ωñ‡Ωº‡Ωë‡ºã‡Ω°‡Ω≤‡ΩÇ - nr added lang:  0
br brezhoneg - nr added lang:  0
	 raa.se - nr added domain:  1
	 kulturnav.org - nr added domain:  3

In [16]:
#results

In [21]:
domains

['raa.se',
 'kringla.nu',
 'fornsok.se',
 'kulturnav.org',
 'k-samsok.se',
 'kulturmiljolagen.se']

In [22]:
df_raa = pd.DataFrame(results)

In [23]:
df_raa

Unnamed: 0,lang,page_title,url,wiki_link,domain,lang_code,lang_name
0,ace,Leonard Cohen,http://kulturnav.org/language/en/7e908441-ad8d...,https://ace.wikipedia.org/wiki/Leonard_Cohen,kulturnav.org,ace,Ac√®h
1,ace,Agnetha F√§ltskog,http://kulturnav.org/language/en/1718a400-a2bc...,https://ace.wikipedia.org/wiki/Agnetha_F√§ltskog,kulturnav.org,ace,Ac√®h
2,ace,Anni-Frid Lyngstad,http://kulturnav.org/language/en/a75e48ae-e70f...,https://ace.wikipedia.org/wiki/Anni-Frid_Lyngstad,kulturnav.org,ace,Ac√®h
3,af,Adils,http://www.raa.se/cms/extern/se_och_besoka/sev...,https://af.wikipedia.org/wiki/Adils,raa.se,af,Afrikaans
4,af,Oscar Niemeyer,http://kulturnav.org/6aa00405-f813-4e04-96b2-1...,https://af.wikipedia.org/wiki/Oscar_Niemeyer,kulturnav.org,af,Afrikaans
...,...,...,...,...,...,...,...
113007,zh,ÈòøÂÖãÂ°ûÂà©¬∑Âä†‰º¶-Âç°Ëé±Êãâ,http://kulturnav.org/05592038-b220-4690-a4b0-2...,https://zh.wikipedia.org/wiki/ÈòøÂÖãÂ°ûÂà©¬∑Âä†‰º¶-Âç°Ëé±Êãâ,kulturnav.org,zh,‰∏≠Êñá
113008,zh,Ë∑ØÊòì-ËÖìÂäõ‰∏Ä‰∏ñ,http://kulturnav.org/7996dfff-be63-44e5-a183-b...,https://zh.wikipedia.org/wiki/Ë∑ØÊòì-ËÖìÂäõ‰∏Ä‰∏ñ,kulturnav.org,zh,‰∏≠Êñá
113009,zh,ËÖìÂäõ‰∫å‰∏ñ (Â••Â∞îËâØÂÖ¨Áàµ),http://kulturnav.org/3ae5117c-3ef8-4d33-9eea-9...,https://zh.wikipedia.org/wiki/ËÖìÂäõ‰∫å‰∏ñ_(Â••Â∞îËâØÂÖ¨Áàµ),kulturnav.org,zh,‰∏≠Êñá
113010,zh,Êó•Êú¨,http://kulturnav.org/de944dd7-838e-46d0-a342-b...,https://zh.wikipedia.org/wiki/Êó•Êú¨,kulturnav.org,zh,‰∏≠Êñá


In [24]:
len(df_raa)

113012

In [25]:
df_raa = df_raa.rename(columns={
    "lang": "Spr√•k",
    "page_title": "Wikipedia-sida",
    "url": "Extern l√§nk",
    "wiki_link": "Wikipedia-l√§nk",
    "domain": "Dom√§n"
})

In [26]:
df_raa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113012 entries, 0 to 113011
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   Spr√•k           113012 non-null  object
 1   Wikipedia-sida  113012 non-null  object
 2   Extern l√§nk     113012 non-null  object
 3   Wikipedia-l√§nk  113012 non-null  object
 4   Dom√§n           113012 non-null  object
 5   lang_code       113012 non-null  object
 6   lang_name       113012 non-null  object
dtypes: object(7)
memory usage: 6.0+ MB


In [27]:
import pandas as pd

# --- Stats ---
total_links = len(df_raa)
total_unique_links = df_raa['Extern l√§nk'].nunique()
num_languages = df_raa['Spr√•k'].nunique()
langs_sorted = df_raa['Spr√•k'].value_counts()

print("Total links:", total_links)
print("Total unique links:", total_unique_links)
print("Number of languages:", num_languages)
print("\nLanguages with most links:")
print(langs_sorted.to_string())


Total links: 113012
Total unique links: 60683
Number of languages: 59

Languages with most links:
Spr√•k
sv        28653
en        17243
no        11429
de         7837
es         6794
fi         6212
da         4465
ja         4305
nl         3433
nn         3148
eu         3076
id         2550
zh         2126
ro         1984
gl         1705
af         1118
ast         984
an          845
simple      509
vi          455
lb          405
nan         386
bs          384
hak         331
gv          321
ilo         313
cdo         304
ms          296
pl          271
sco         177
diq         146
tl          127
it          125
frr         104
gpe          56
sc           55
kaa          53
tum          52
tt           45
min          34
sq           33
bcl          22
se           19
ha           17
dtp          14
av           11
su           10
ig           10
ceb           5
nds           3
ace           3
so            2
bar           1
nia           1
mad           1
sw            1

In [28]:
from urllib.parse import urlparse
import pandas as pd

# Extract domains
df_raa['domain'] = df_raa['Extern l√§nk'].apply(lambda u: urlparse(u).netloc)

# Count links grouped by domain
links_per_domain = df_raa['domain'].value_counts()

print("Links per domain:")
print(links_per_domain.to_string())


Links per domain:
domain
kulturnav.org                     89490
app.raa.se                         9332
www.bebyggelseregistret.raa.se     3679
bebyggelseregistret.raa.se         3256
www.raa.se                         3105
www.fmis.raa.se                    2164
www.kringla.nu                      828
samla.raa.se                        638
pub.raa.se                          156
kmb.raa.se                          118
www.kms.raa.se                      108
kringla.nu                           71
hildebrand.raa.se                    24
raa.se                               14
old.raa.se                            7
materialguiden.raa.se                 4
fmis.raa.se                           4
vitalis.raa.se                        4
kms.raa.se                            3
www.raa.se:80                         1
www.dialog.raa.se                     1
ritreg.raa.se                         1
agendakulturarv.raa.se                1
dms.raa.se                            1
fou-anslag.raa.

In [29]:
import pandas as pd

# --- Stats ---
total_links = len(df_raa)
total_unique_links = df_raa['Extern l√§nk'].nunique()
num_languages = df_raa['Spr√•k'].nunique()
langs_sorted = df_raa['Spr√•k'].value_counts()

print("Total links:", total_links)
print("Total unique links:", total_unique_links)
print("Number of languages:", num_languages)
print("\nLanguages with most links:")
print(langs_sorted.to_string())


Total links: 113012
Total unique links: 60683
Number of languages: 59

Languages with most links:
Spr√•k
sv        28653
en        17243
no        11429
de         7837
es         6794
fi         6212
da         4465
ja         4305
nl         3433
nn         3148
eu         3076
id         2550
zh         2126
ro         1984
gl         1705
af         1118
ast         984
an          845
simple      509
vi          455
lb          405
nan         386
bs          384
hak         331
gv          321
ilo         313
cdo         304
ms          296
pl          271
sco         177
diq         146
tl          127
it          125
frr         104
gpe          56
sc           55
kaa          53
tum          52
tt           45
min          34
sq           33
bcl          22
se           19
ha           17
dtp          14
av           11
su           10
ig           10
ceb           5
nds           3
ace           3
so            2
bar           1
nia           1
mad           1
sw            1

In [30]:
from datetime import date
import os

# S√§tt datum
today = date.today().strftime("%Y_%m_%d")

# Se till att katalogen finns
os.makedirs("results", exist_ok=True)

# Bygg filnamn
outfile = f"results/links_raa_{today}.csv"

# Exportera
df_raa.to_csv(outfile, index=False, encoding="utf-8")

print(f"[OK] Exported {len(df_raa)} rows to {outfile}")


[OK] Exported 113012 rows to results/links_raa_2025_11_30.csv


In [31]:
print(df_raa.columns.tolist())

['Spr√•k', 'Wikipedia-sida', 'Extern l√§nk', 'Wikipedia-l√§nk', 'Dom√§n', 'lang_code', 'lang_name', 'domain']


In [32]:
from urllib.parse import urlparse
import pandas as pd

# Extract domain
df_raa['domain'] = df_raa['Extern l√§nk'].apply(lambda u: urlparse(u).netloc)

# Count links grouped by domain
links_per_domain = df_raa['domain'].value_counts()

print("Links per domain:")
print(links_per_domain.to_string())


Links per domain:
domain
kulturnav.org                     89490
app.raa.se                         9332
www.bebyggelseregistret.raa.se     3679
bebyggelseregistret.raa.se         3256
www.raa.se                         3105
www.fmis.raa.se                    2164
www.kringla.nu                      828
samla.raa.se                        638
pub.raa.se                          156
kmb.raa.se                          118
www.kms.raa.se                      108
kringla.nu                           71
hildebrand.raa.se                    24
raa.se                               14
old.raa.se                            7
materialguiden.raa.se                 4
fmis.raa.se                           4
vitalis.raa.se                        4
kms.raa.se                            3
www.raa.se:80                         1
www.dialog.raa.se                     1
ritreg.raa.se                         1
agendakulturarv.raa.se                1
dms.raa.se                            1
fou-anslag.raa.

In [33]:
GITHUB_REPO = "https://github.com/salgo60/Svenskaforsamlingar"
script_name = SCRIPT_NAME

In [None]:
script_name

In [None]:
# --- Stats ---
total_links = len(df_raa)
total_unique_links = df_raa["Extern l√§nk"].nunique()
langs_with_hits = sorted(df_raa["Spr√•k"].unique())

num_languages_found = len(langs_with_hits)
num_languages_checked = len(df_wiki)        # alla spr√•k som genoms√∂ktes
num_languages_found = df_raa['Spr√•k'].nunique()

In [None]:
from pathlib import Path
from datetime import date
import pandas as pd

def save_sortable_html_df_raa(
    df,
    out_dir="results",
    domains=None,
    issue_url="https://github.com/salgo60/SamlaLibris/issues/42",
):
    out_dir = Path(out_dir)
    out_dir.mkdir(exist_ok=True)

    today = date.today().strftime("%Y_%m_%d")
    out_path = out_dir / f"links_raa_{today}.html"
    rerun_ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")


    domains = domains or [] 

    # G√∂r l√§nkar klickbara om de finns
    df = df.copy()  
    if "Wikipedia-l√§nk" in df.columns:
        df["Wikipedia-l√§nk"] = df["Wikipedia-l√§nk"].apply(
            lambda x: f'<a href="{x}" target="_blank">{x}</a>' if pd.notna(x) else ""
        )
    if "Extern l√§nk" in df.columns:
        df["Extern l√§nk"] = df["Extern l√§nk"].apply(
            lambda x: f'<a href="{x}" target="_blank">{x}</a>' if pd.notna(x) else ""
        )

    html_table = df.to_html(
        classes="pivot",
        border=0,
        escape=False,   # viktigt f√∂r klickbara l√§nkar
        index=False
    )
   
    css = """
    <style>
      body {
        font-family: Arial, sans-serif;
        margin: 20px;
      }
      table.pivot {
        border-collapse: collapse;
        width: 100%;
        font-size: 12px;
      }
      table.pivot th, table.pivot td {
        border: 1px solid #999;
        padding: 6px 8px;
        text-align: left;
        vertical-align: top;
        white-space: normal;
      }
      table.pivot th {
        cursor: pointer;
        background: #f2f2f2;
      }
      table.pivot th:hover {
        background: #e2e2e2;
      } 
      table.pivot thead th {
          position: sticky;
          top: 0;
          background: #f2f2f2;
          z-index: 2;
        }
        
        /* Sorting indicators */
        table.pivot th::after {
          content: "";
          float: right;
          opacity: 0.4;
        }
        
        table.pivot th.sorted-asc::after {
          content: " ‚ñ≤";
        }
        
        table.pivot th.sorted-desc::after {
          content: " ‚ñº";
        }
          .meta {
            background: #f8f8f8;
            border: 1px solid #ccc;
            padding: 12px;
            margin-bottom: 20px;
            font-size: 13px;
          }
          .meta h2 {
            margin-top: 0;
          }
          .meta ul {
            margin: 0;
            padding-left: 18px;
          }
    </style>
    """

    js = """
    <script>
    document.addEventListener('DOMContentLoaded', () => {
        document.querySelectorAll("table.pivot th").forEach((header, colIndex) => {
            header.addEventListener("click", () => {
                const table = header.closest("table");
                const tbody = table.querySelector("tbody");
                const rows = Array.from(tbody.querySelectorAll("tr"));
                const asc = !header.classList.contains("sorted-asc");

                rows.sort((a, b) => {
                    const A = a.children[colIndex].innerText.trim();
                    const B = b.children[colIndex].innerText.trim();

                    const numA = parseFloat(A.replace(",", ".")) || null;
                    const numB = parseFloat(B.replace(",", ".")) || null;

                    if (numA !== null && numB !== null) {
                        return asc ? numA - numB : numB - numA;
                    }
                    return asc ? A.localeCompare(B) : B.localeCompare(A);
                });

                table.querySelectorAll("th").forEach(th =>
                    th.classList.remove("sorted-asc","sorted-desc")
                );
                header.classList.add(asc ? "sorted-asc" : "sorted-desc");

                rows.forEach(row => tbody.appendChild(row));
            });
        });
    });
    </script>
    """
    meta_html = f"""
    <div class="meta">
      <h2>Metadata</h2>
    
      <p><strong>Rerun:</strong> {rerun_ts}</p>
      <p><strong>Script:</strong>
         <a href="{SCRIPT_URL}" target="_blank">{SCRIPT_NAME}</a>
      </p>
      <p><strong>Issue:</strong>
         <a href="{issue_url}" target="_blank">{issue_url.split('/')[-1]}</a>
      </p>
    
      <p><strong>Datak√§llor:</strong></p>
      <div class="meta-indent">
        Wikidata<br>
        MediaWiki API ‚Äì exturlusage
      </div>
    
      <p><strong>Parametrar:</strong></p>
      <div class="meta-indent">
        Spr√•kfilter: exkluderar "special"
      </div>
    
      <p><strong>Antal spr√•k genoms√∂kta:</strong> {num_languages_checked}</p>
      <p><strong>Antal spr√•k med tr√§ffar:</strong> {num_languages_found}</p>
      <p><strong>Wikipedia-spr√•k med tr√§ffar:</strong> {", ".join(langs_with_hits)}</p>
      <p><strong>Totala l√§nkar:</strong> {total_links}</p>
      <p><strong>Unika l√§nkar:</strong> {total_unique_links}</p>
    </div>
    """
    
    html = f"""
    <html>
    <head>
      <meta charset="utf-8">
      <title>RA√Ñ links in Wikipedia</title>
      {css}
    </head>
    <body>
      <h1>Wikipedia ‚Üí RA√Ñ</h1>
      {meta_html}
      <p>Sorterbar tabell. Klicka p√• kolumnrubriker f√∂r sortering.</p>
      {html_table}
      {js}
    </body>
    </html>
    """

    out_path.write_text(html, encoding="utf-8")
    print(f"‚úÖ HTML skapad: {out_path}")


In [None]:
save_sortable_html_df_raa(df_raa)


In [None]:
 # End timer and calculate duration
end_time = time.time()
elapsed_time = end_time - start_time# Bygg audit-lager f√∂r den h√§r etappen

# Print current date and total time
print("Date:", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
minutes, seconds = divmod(elapsed_time, 60)
print("Total time elapsed: {:02.0f} minutes {:05.2f} seconds".format(minutes, seconds))
