* [#42](https://github.com/salgo60/SamlaLibris/issues/42)
* RAA42_linkroot.ipynb

In [1]:
import time

from datetime import datetime

now = datetime.now()
timestamp = now.timestamp()

start_time = time.time()
print("Start:", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

Start: 2025-11-30 11:56:03


In [2]:
SCRIPT_NAME = "RAA42_linkroot.ipynb"
SCRIPT_URL = (
    "https://github.com/salgo60/SamlaLibris/"
    "blob/master/Notebook/RAA42_linkroot.ipynb"
)

In [None]:
import os

# Get the current working directory
current_directory = os.getcwd()
print("Current Working Directory:", current_directory)



In [26]:
def read_domains(file_path):
    print(f"[DEBUG] Reading domains from: {file_path}")
    print(f"[DEBUG] Current Working Directory: {current_directory}")

    df = pd.read_csv(file_path, header=0)   # <- skip header row
    domains_list = df.iloc[:, 0].dropna().unique().tolist()
    print(f"[DEBUG] Found {len(domains_list)} domains.")
    return domains_list


In [27]:
import requests

def fetch_sitematrix_df():
    url = "https://meta.wikimedia.org/w/api.php"
    params = {
        "action": "sitematrix",
        "format": "json"
    }
    headers = {
        "User-Agent": "salgo60-language-fetcher/1.0 (salgo60@msn.com)"
    }

    print("[DEBUG] Fetching sitematrix‚Ä¶")
    r = requests.get(url, params=params, headers=headers)
    r.raise_for_status()

    if "application/json" not in r.headers.get("Content-Type", ""):
        raise ValueError("Server returned non-JSON response")

    data = r.json()["sitematrix"]

    rows = []

    # --- language-specific sites ---
    for key, lang_block in data.items():
        if not key.isdigit():
            continue  # skip "count", "specials"

        lang_code = lang_block.get("code")
        lang_name = lang_block.get("name")

        for site in lang_block.get("site", []):
            rows.append({
                "lang_code": lang_code,
                "lang_name": lang_name,
                "project": site.get("project"),
                "url": site.get("url"),
                "dbname": site.get("dbname"),
                "site_name": site.get("sitename"),
                "closed": site.get("closed", False)
            })

    # --- special wikis (Wikidata, Commons, Meta, etc.) ---
    for site in data.get("specials", []):
        rows.append({
            "lang_code": "special",
            "lang_name": "special",
            "project": site.get("project"),
            "url": site.get("url"),
            "dbname": site.get("dbname"),
            "site_name": site.get("sitename"),
            "closed": site.get("closed", False)
        })

    return pd.DataFrame(rows)


In [28]:
import requests
import pandas as pd


HEADERS = {
    "User-Agent": "salgo60-language-fetcher/2.0 (https://github.com/salgo60)"
}


df_wiki = fetch_sitematrix_df()
df_wiki["closed"] = df_lang_fetch["closed"].fillna(False).astype(bool)
df_wiki = df_wiki[df_wiki["site_name"] == "Wikipedia"]
df_wiki.head()



[DEBUG] Fetching sitematrix‚Ä¶


Unnamed: 0,lang_code,lang_name,project,url,dbname,site_name,closed
0,aa,Qaf√°r af,,https://aa.wikipedia.org,aawiki,Wikipedia,False
5,ace,Ac√®h,,https://ace.wikipedia.org,acewiki,Wikipedia,False
7,af,Afrikaans,,https://af.wikipedia.org,afwiki,Wikipedia,False
11,ak,,,https://ak.wikipedia.org,akwiki,Wikipedia,False
18,ami,Pangcah,,https://ami.wikipedia.org,amiwiki,Wikipedia,False


In [29]:
df_wiki.info()

<class 'pandas.core.frame.DataFrame'>
Index: 207 entries, 0 to 1061
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   lang_code  207 non-null    object
 1   lang_name  206 non-null    object
 2   project    0 non-null      object
 3   url        207 non-null    object
 4   dbname     207 non-null    object
 5   site_name  207 non-null    object
 6   closed     207 non-null    bool  
dtypes: bool(1), object(6)
memory usage: 11.5+ KB


In [30]:
len(df_wiki)

207

In [31]:
# -----------------------------------------------------------
# Fetch exturlusage entries for one lang/domain
# -----------------------------------------------------------
def fetch_exturlusage(lang, domain):
    base = f"https://{lang}.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "list": "exturlusage",
        "euquery": domain,
        "eulimit": "max"
    }
    while True:
        r = session.get(base, params=params, timeout=10)
        try:
            data = r.json()
        except ValueError:
            print(f"[WARN] {lang}: JSON decode failed")
            break

        for item in data.get("query", {}).get("exturlusage", []):
            yield {
                "lang": lang,
                "page_title": item.get("title"),
                "url": item.get("url"),
                "wiki_link": f"https://{lang}.wikipedia.org/wiki/{item.get('title').replace(' ', '_')}"
            }

        if "continue" not in data:
            break
        params.update(data["continue"])
        time.sleep(0.3)

In [32]:
import os
import time
import random
import requests
import pandas as pd
from urllib.parse import urlparse
from tqdm.notebook import tqdm
file_path_domain = "sources/domains.csv"
domains = read_domains(file_path_domain)
print(domains)


[DEBUG] Reading domains from: sources/domains.csv
[DEBUG] Current Working Directory: /Users/salgo/Documents/GitHub/SamlaLibris/notebook
[DEBUG] Found 2 domains.
['raa.se', 'kringla.nu']


In [33]:

# -------------------------
# Session & helpers
# -------------------------
session = requests.Session()
session.headers.update({"User-Agent": "SCB-LinkAudit/1.0 (https://www.scb.se/)"})

# we need some filtering  
df_wiki_lang = df_wiki[df_wiki["lang_code"] != "special"] 

print("Antal Spr√•k:",len(df_wiki_lang ))
results = []
for _, row in df_wiki_lang.iterrows():
    lang = row["lang_code"]
    url  = row["url"]
    lang_name = row["lang_name"]
    before = len(results)
    #print(lang, url, lang_name,domains)
    for entry in fetch_exturlusage(lang, domains):
        results.append(entry)     
    after = len(results) 
    links = after-before
    print(lang, url, lang_name," - ", links)
    


Antal Spr√•k: 184
aa https://aa.wikipedia.org Qaf√°r af  -  0
ace https://ace.wikipedia.org Ac√®h  -  0
af https://af.wikipedia.org Afrikaans  -  0
ak https://ak.wikipedia.org None  -  0
ami https://ami.wikipedia.org Pangcah  -  0
an https://an.wikipedia.org aragon√©s  -  0
ast https://ast.wikipedia.org asturianu  -  0
av https://av.wikipedia.org –∞–≤–∞—Ä  -  0
avk https://avk.wikipedia.org Kotava  -  0
ay https://ay.wikipedia.org Aymar aru  -  0
bar https://bar.wikipedia.org Boarisch  -  0
bbc https://bbc.wikipedia.org Batak Toba  -  0
bcl https://bcl.wikipedia.org Bikol Central  -  0
bi https://bi.wikipedia.org Bislama  -  0
bm https://bm.wikipedia.org bamanankan  -  0
bo https://bo.wikipedia.org ‡Ωñ‡Ωº‡Ωë‡ºã‡Ω°‡Ω≤‡ΩÇ  -  0
br https://br.wikipedia.org brezhoneg  -  0
bs https://bs.wikipedia.org bosanski  -  0
btm https://btm.wikipedia.org Batak Mandailing  -  0
bug https://bug.wikipedia.org Basa Ugi  -  0
bxr https://bxr.wikipedia.org –±—É—Ä—è–∞–¥  -  0
cbk-zam https://cbk-zam.wikipe

In [34]:
domains

['raa.se', 'kringla.nu']

In [35]:
df_raa = pd.DataFrame(results)

In [36]:
df_raa

Unnamed: 0,lang,page_title,url,wiki_link
0,da,Gribshunden,http://www.kringla.nu/kringla/objekt?referens=...,https://da.wikipedia.org/wiki/Gribshunden
1,da,Guldhornene,http://www.kringla.nu/kringla/objekt?referens=...,https://da.wikipedia.org/wiki/Guldhornene
2,de,"Riks√§pplet (Schiff, 1663)",http://www.kringla.nu/kringla/objekt?referens=...,https://de.wikipedia.org/wiki/Riks√§pplet_(Schi...
3,de,Gamla bankhuset,http://www.kringla.nu/kringla/objekt?referens=...,https://de.wikipedia.org/wiki/Gamla_bankhuset
4,de,Kirche von Akeb√§ck,http://www.kringla.nu/kringla/objekt?referens=...,https://de.wikipedia.org/wiki/Kirche_von_Akeb√§ck
...,...,...,...,...
894,sv,Margit Webj√∂rn,https://kringla.nu/kringla/objekt?text=margit+...,https://sv.wikipedia.org/wiki/Margit_Webj√∂rn
895,sv,Margit Webj√∂rn,https://www.kringla.nu/kringla/objekt;jsession...,https://sv.wikipedia.org/wiki/Margit_Webj√∂rn
896,sw,Betty Olsson,https://www.kringla.nu/kringla/objekt?referens...,https://sw.wikipedia.org/wiki/Betty_Olsson
897,vi,Tr·∫≠n L√ºtzen (1632),http://www.kringla.nu/kringla/objekt;jsessioni...,https://vi.wikipedia.org/wiki/Tr·∫≠n_L√ºtzen_(1632)


In [37]:
len(df_raa)

899

In [38]:
import pandas as pd

# --- Stats ---
total_links = len(df_raa)
total_unique_links = df_raa['url'].nunique()
num_languages = df_raa['lang'].nunique()
langs_sorted = df_raa['lang'].value_counts()

print("Total links:", total_links)
print("Total unique links:", total_unique_links)
print("Number of languages:", num_languages)
print("\nLanguages with most links:")
print(langs_sorted.to_string())


Total links: 899
Total unique links: 804
Number of languages: 17

Languages with most links:
lang
sv        735
no         83
en         48
de         10
es          5
fi          5
da          2
it          2
sq          1
vi          1
sw          1
nn          1
simple      1
pl          1
ja          1
ha          1
zh          1


In [39]:
from urllib.parse import urlparse
import pandas as pd

# Extract domains
df_raa['domain'] = df_raa['url'].apply(lambda u: urlparse(u).netloc)

# Count links grouped by domain
links_per_domain = df_raa['domain'].value_counts()

print("Links per domain:")
print(links_per_domain.to_string())


Links per domain:
domain
www.kringla.nu    828
kringla.nu         71


In [40]:
df_raa = df_raa.rename(columns={
    "lang": "Spr√•k",
    "page_title": "Wikipedia-sida",
    "url": "Extern l√§nk",
    "wiki_link": "Wikipedia-l√§nk",
    "domain": "Dom√§n"
})


In [41]:
import pandas as pd

# --- Stats ---
total_links = len(df_raa)
total_unique_links = df_raa['Extern l√§nk'].nunique()
num_languages = df_raa['Spr√•k'].nunique()
langs_sorted = df_raa['Spr√•k'].value_counts()

print("Total links:", total_links)
print("Total unique links:", total_unique_links)
print("Number of languages:", num_languages)
print("\nLanguages with most links:")
print(langs_sorted.to_string())


Total links: 899
Total unique links: 804
Number of languages: 17

Languages with most links:
Spr√•k
sv        735
no         83
en         48
de         10
es          5
fi          5
da          2
it          2
sq          1
vi          1
sw          1
nn          1
simple      1
pl          1
ja          1
ha          1
zh          1


In [43]:
from datetime import date
import os

# S√§tt datum
today = date.today().strftime("%Y_%m_%d")

# Se till att katalogen finns
os.makedirs("results", exist_ok=True)

# Bygg filnamn
outfile = f"results/links_raa_{today}.csv"

# Exportera
df_raa.to_csv(outfile, index=False, encoding="utf-8")

print(f"[OK] Exported {len(df_raa)} rows to {outfile}")


[OK] Exported 899 rows to results/links_raa_2025_11_30.csv


In [44]:
print(df_raa.columns.tolist())

['Spr√•k', 'Wikipedia-sida', 'Extern l√§nk', 'Wikipedia-l√§nk', 'Dom√§n']


In [45]:
from urllib.parse import urlparse
import pandas as pd

# Extract domain
df_raa['domain'] = df_raa['Extern l√§nk'].apply(lambda u: urlparse(u).netloc)

# Count links grouped by domain
links_per_domain = df_raa['domain'].value_counts()

print("Links per domain:")
print(links_per_domain.to_string())


Links per domain:
domain
www.kringla.nu    828
kringla.nu         71


In [47]:
GITHUB_REPO = "https://github.com/salgo60/Svenskaforsamlingar"
script_name = SCRIPT_NAME

In [52]:
script_name

'RAA42_linkroot.ipynb'

In [53]:
# --- Stats ---
total_links = len(df_raa)
total_unique_links = df_raa["Extern l√§nk"].nunique()
langs_with_hits = sorted(df_raa["Spr√•k"].unique())

num_languages_found = len(langs_with_hits)
num_languages_checked = len(df_wiki)        # alla spr√•k som genoms√∂ktes
num_languages_found = df_raa['Spr√•k'].nunique()

In [56]:
from pathlib import Path
from datetime import date
import pandas as pd

def save_sortable_html_df_raa(
    df,
    out_dir="results",
    domains=None,
    issue_url="https://github.com/salgo60/SamlaLibris/issues/42",
):
    out_dir = Path(out_dir)
    out_dir.mkdir(exist_ok=True)

    today = date.today().strftime("%Y_%m_%d")
    out_path = out_dir / f"links_raa_{today}.html"
    rerun_ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")


    domains = domains or [] 

    # G√∂r l√§nkar klickbara om de finns
    df = df.copy()  
    if "Wikipedia-l√§nk" in df.columns:
        df["Wikipedia-l√§nk"] = df["Wikipedia-l√§nk"].apply(
            lambda x: f'<a href="{x}" target="_blank">{x}</a>' if pd.notna(x) else ""
        )
    if "Extern l√§nk" in df.columns:
        df["Extern l√§nk"] = df["Extern l√§nk"].apply(
            lambda x: f'<a href="{x}" target="_blank">{x}</a>' if pd.notna(x) else ""
        )

    html_table = df.to_html(
        classes="pivot",
        border=0,
        escape=False,   # viktigt f√∂r klickbara l√§nkar
        index=False
    )
   
    css = """
    <style>
      body {
        font-family: Arial, sans-serif;
        margin: 20px;
      }
      table.pivot {
        border-collapse: collapse;
        width: 100%;
        font-size: 12px;
      }
      table.pivot th, table.pivot td {
        border: 1px solid #999;
        padding: 6px 8px;
        text-align: left;
        vertical-align: top;
        white-space: normal;
      }
      table.pivot th {
        cursor: pointer;
        background: #f2f2f2;
      }
      table.pivot th:hover {
        background: #e2e2e2;
      } 
      table.pivot thead th {
          position: sticky;
          top: 0;
          background: #f2f2f2;
          z-index: 2;
        }
        
        /* Sorting indicators */
        table.pivot th::after {
          content: "";
          float: right;
          opacity: 0.4;
        }
        
        table.pivot th.sorted-asc::after {
          content: " ‚ñ≤";
        }
        
        table.pivot th.sorted-desc::after {
          content: " ‚ñº";
        }
          .meta {
            background: #f8f8f8;
            border: 1px solid #ccc;
            padding: 12px;
            margin-bottom: 20px;
            font-size: 13px;
          }
          .meta h2 {
            margin-top: 0;
          }
          .meta ul {
            margin: 0;
            padding-left: 18px;
          }
    </style>
    """

    js = """
    <script>
    document.addEventListener('DOMContentLoaded', () => {
        document.querySelectorAll("table.pivot th").forEach((header, colIndex) => {
            header.addEventListener("click", () => {
                const table = header.closest("table");
                const tbody = table.querySelector("tbody");
                const rows = Array.from(tbody.querySelectorAll("tr"));
                const asc = !header.classList.contains("sorted-asc");

                rows.sort((a, b) => {
                    const A = a.children[colIndex].innerText.trim();
                    const B = b.children[colIndex].innerText.trim();

                    const numA = parseFloat(A.replace(",", ".")) || null;
                    const numB = parseFloat(B.replace(",", ".")) || null;

                    if (numA !== null && numB !== null) {
                        return asc ? numA - numB : numB - numA;
                    }
                    return asc ? A.localeCompare(B) : B.localeCompare(A);
                });

                table.querySelectorAll("th").forEach(th =>
                    th.classList.remove("sorted-asc","sorted-desc")
                );
                header.classList.add(asc ? "sorted-asc" : "sorted-desc");

                rows.forEach(row => tbody.appendChild(row));
            });
        });
    });
    </script>
    """
    meta_html = f"""
    <div class="meta">
      <h2>Metadata</h2>
    
      <p><strong>Rerun:</strong> {rerun_ts}</p>
      <p><strong>Script:</strong>
         <a href="{SCRIPT_URL}" target="_blank">{SCRIPT_NAME}</a>
      </p>
      <p><strong>Issue:</strong>
         <a href="{issue_url}" target="_blank">{issue_url.split('/')[-1]}</a>
      </p>
    
      <p><strong>Datak√§llor:</strong></p>
      <div class="meta-indent">
        Wikidata<br>
        MediaWiki API ‚Äì exturlusage
      </div>
    
      <p><strong>Parametrar:</strong></p>
      <div class="meta-indent">
        Spr√•kfilter: exkluderar "special"
      </div>
    
      <p><strong>Antal spr√•k genoms√∂kta:</strong> {num_languages_checked}</p>
      <p><strong>Antal spr√•k med tr√§ffar:</strong> {num_languages_found}</p>
      <p><strong>Wikipedia-spr√•k med tr√§ffar:</strong> {", ".join(langs_with_hits)}</p>
      <p><strong>Totala l√§nkar:</strong> {total_links}</p>
      <p><strong>Unika l√§nkar:</strong> {total_unique_links}</p>
    </div>
    """
    
    html = f"""
    <html>
    <head>
      <meta charset="utf-8">
      <title>RA√Ñ links in Wikipedia</title>
      {css}
    </head>
    <body>
      <h1>Wikipedia ‚Üí RA√Ñ</h1>
      {meta_html}
      <p>Sorterbar tabell. Klicka p√• kolumnrubriker f√∂r sortering.</p>
      {html_table}
      {js}
    </body>
    </html>
    """

    out_path.write_text(html, encoding="utf-8")
    print(f"‚úÖ HTML skapad: {out_path}")


In [57]:
save_sortable_html_df_raa(df_raa)


‚úÖ HTML skapad: results/links_raa_2025_11_30.html
