### Checka länkar 
DIGG går i graven och osäkerhet finns hur denna process fungerar
* [#1467](https://github.com/salgo60/DIGG-skuggbacklog/issues/1467)
* Notebook [DIGG_1467_linkroot](https://github.com/salgo60/DIGG-skuggbacklog/tree/master/Notebook/DIGG_1467_linkroot.ipynb)


URLs example  
* https://www.digg.se/ 
* https://webbriktlinjer.se 
* https://beta.webbriktlinjer.se
* https://www.dataportal.se

In [1]:
import time

from datetime import datetime

now = datetime.now()
timestamp = now.timestamp()

start_time = time.time()
print("Start:", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

Start: 2025-11-30 13:37:35


In [2]:
SCRIPT_NAME = "DIGG_1467_linkroot.ipynb"
SCRIPT_URL = (
    "https://github.com/salgo60/DIGG-skuggbacklog/"
    "blob/master/Notebook/DIGG_1467_linkroot.ipynb"
)

In [3]:
def read_domains(file_path):
    print(f"[DEBUG] Reading domains from: {file_path}")
    df = pd.read_csv(file_path, header=0)   # <- skip header row
    domains_list = df.iloc[:, 0].dropna().unique().tolist()
    print(f"[DEBUG] Found {len(domains_list)} domains.")
    return domains_list


In [4]:
import requests

def fetch_sitematrix_df():
    url = "https://meta.wikimedia.org/w/api.php"
    params = {
        "action": "sitematrix",
        "format": "json"
    }
    headers = {
        "User-Agent": "salgo60-language-fetcher/1.0 (salgo60@msn.com)"
    }

    print("[DEBUG] Fetching sitematrix…")
    r = requests.get(url, params=params, headers=headers)
    r.raise_for_status()

    if "application/json" not in r.headers.get("Content-Type", ""):
        raise ValueError("Server returned non-JSON response")

    data = r.json()["sitematrix"]

    rows = []

    # --- language-specific sites ---
    for key, lang_block in data.items():
        if not key.isdigit():
            continue  # skip "count", "specials"

        lang_code = lang_block.get("code")
        lang_name = lang_block.get("name")

        for site in lang_block.get("site", []):
            rows.append({
                "lang_code": lang_code,
                "lang_name": lang_name,
                "project": site.get("project"),
                "url": site.get("url"),
                "dbname": site.get("dbname"),
                "site_name": site.get("sitename"),
                "closed": site.get("closed", False)
            })

    # --- special wikis (Wikidata, Commons, Meta, etc.) ---
    for site in data.get("specials", []):
        rows.append({
            "lang_code": "special",
            "lang_name": "special",
            "project": site.get("project"),
            "url": site.get("url"),
            "dbname": site.get("dbname"),
            "site_name": site.get("sitename"),
            "closed": site.get("closed", False)
        })

    return pd.DataFrame(rows)


In [5]:
import requests
import pandas as pd


HEADERS = {
    "User-Agent": "salgo60-language-fetcher/2.0 (https://github.com/salgo60)"
}


df_lang_fetch = fetch_sitematrix_df()
df_lang_fetch["closed"] = df_lang_fetch["closed"].fillna(False).astype(bool)

df_lang_fetch.head()



[DEBUG] Fetching sitematrix…


Unnamed: 0,lang_code,lang_name,project,url,dbname,site_name,closed
0,aa,Qafár af,,https://aa.wikipedia.org,aawiki,Wikipedia,False
1,aa,Qafár af,,https://aa.wiktionary.org,aawiktionary,Wiktionary,False
2,aa,Qafár af,,https://aa.wikibooks.org,aawikibooks,Wikibooks,False
3,ab,аԥсшәа,,https://ab.wikipedia.org,abwiki,Авикипедиа,False
4,ab,аԥсшәа,,https://ab.wiktionary.org,abwiktionary,Wiktionary,False


In [6]:
df_lang_fetch.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1063 entries, 0 to 1062
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   lang_code  1063 non-null   object
 1   lang_name  1060 non-null   object
 2   project    0 non-null      object
 3   url        1063 non-null   object
 4   dbname     1063 non-null   object
 5   site_name  1063 non-null   object
 6   closed     1063 non-null   bool  
dtypes: bool(1), object(6)
memory usage: 51.0+ KB


In [7]:
df_wiki = df_lang_fetch[df_lang_fetch["site_name"] == "Wikipedia"] 
df_wiki.info()

<class 'pandas.core.frame.DataFrame'>
Index: 207 entries, 0 to 1061
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   lang_code  207 non-null    object
 1   lang_name  206 non-null    object
 2   project    0 non-null      object
 3   url        207 non-null    object
 4   dbname     207 non-null    object
 5   site_name  207 non-null    object
 6   closed     207 non-null    bool  
dtypes: bool(1), object(6)
memory usage: 11.5+ KB


In [8]:
df_wiki.head()

Unnamed: 0,lang_code,lang_name,project,url,dbname,site_name,closed
0,aa,Qafár af,,https://aa.wikipedia.org,aawiki,Wikipedia,False
5,ace,Acèh,,https://ace.wikipedia.org,acewiki,Wikipedia,False
7,af,Afrikaans,,https://af.wikipedia.org,afwiki,Wikipedia,False
11,ak,,,https://ak.wikipedia.org,akwiki,Wikipedia,False
18,ami,Pangcah,,https://ami.wikipedia.org,amiwiki,Wikipedia,False


In [9]:
len(df_wiki)

207

In [10]:
import os

# Get the current working directory
current_directory = os.getcwd()
print("Current Working Directory:", current_directory)



Current Working Directory: /Users/salgo/Documents/GitHub/DIGG-skuggbacklog/Notebook


In [11]:
# -----------------------------------------------------------
# Fetch exturlusage entries for one lang/domain
# -----------------------------------------------------------
def fetch_exturlusage(lang, domain):
    base = f"https://{lang}.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "list": "exturlusage",
        "euquery": domain,
        "eulimit": "max"
    }
    while True:
        r = session.get(base, params=params, timeout=10)
        try:
            data = r.json()
        except ValueError:
            print(f"[WARN] {lang}: JSON decode failed")
            break

        for item in data.get("query", {}).get("exturlusage", []):
            yield {
                "lang": lang,
                "page_title": item.get("title"),
                "url": item.get("url"),
                "wiki_link": f"https://{lang}.wikipedia.org/wiki/{item.get('title').replace(' ', '_')}"
            }

        if "continue" not in data:
            break
        params.update(data["continue"])
        time.sleep(0.3)

In [12]:
import os
import time
import random
import requests
import pandas as pd
from urllib.parse import urlparse
from tqdm.notebook import tqdm
file_path_domain = "sources/domains.csv"
domains = read_domains(file_path_domain)
print(domains)


[DEBUG] Reading domains from: sources/domains.csv
[DEBUG] Found 3 domains.
['digg.se', 'dataportal.se', 'webbriktlinjer.se']


In [None]:
# -------------------------
# Session & helpers
# -------------------------
session = requests.Session()
session.headers.update(
    {"User-Agent": "DIGG-LinkAudit/1.0 salgo60@msn.com"}
)

# Filter out special languages
df_wiki_lang = df_wiki[df_wiki["lang_code"] != "special"]
print("Antal Språk:", len(df_wiki_lang))

results = []

for _, row in df_wiki_lang.iterrows():
    lang = row["lang_code"]
    url = row["url"]
    lang_name = row["lang_name"]

    before = len(results)
    for d in domains:
        #print(lang, url, lang_name, d)

        for entry in fetch_exturlusage(lang, d):
            entry["domain"] = d
            entry["lang_code"] = lang
            entry["lang_name"] = lang_name
            results.append(entry)

    added = len(results) - before
    print(lang, lang_name, "-", added)


Antal Språk: 184
aa Qafár af - 0
ace Acèh - 0
af Afrikaans - 0
ak None - 0
ami Pangcah - 0
an aragonés - 0
ast asturianu - 0
av авар - 0
avk Kotava - 0
ay Aymar aru - 0
bar Boarisch - 0
bbc Batak Toba - 0
bcl Bikol Central - 0
bi Bislama - 0
bm bamanankan - 0
bo བོད་ཡིག - 0
br brezhoneg - 0
bs bosanski - 0


In [None]:
domains

In [None]:
df_digg = pd.DataFrame(results)

In [None]:
len(df_digg)

In [None]:
df_digg

In [None]:
import pandas as pd

# --- Stats ---
total_links = len(df_digg)
total_unique_links = df_digg['url'].nunique()
num_languages = df_digg['lang'].nunique()
langs_sorted = df_digg['lang'].value_counts()

print("Total links:", total_links)
print("Total unique links:", total_unique_links)
print("Number of languages:", num_languages)
print("\nLanguages with most links:")
print(langs_sorted.to_string())


In [None]:
from urllib.parse import urlparse
import pandas as pd

# Extract domain
df_digg['domain'] = df_digg['url'].apply(lambda u: urlparse(u).netloc)

# Count links grouped by domain
links_per_domain = df_digg['domain'].value_counts()

print("Links per domain:")
print(links_per_domain.to_string())


In [None]:
from datetime import date
import os

# Sätt datum
today = date.today().strftime("%Y_%m_%d")

# Se till att katalogen finns
os.makedirs("results", exist_ok=True)

# Bygg filnamn
outfile = f"results/links_digg_{today}.csv"

# Exportera
df_digg.to_csv(outfile, index=False, encoding="utf-8")

print(f"[OK] Exported {len(df_digg)} rows to {outfile}")


In [None]:
# --- Stats ---
total_links = len(df_digg)
total_unique_links = df_digg["url"].nunique()
langs_with_hits = sorted(df_digg["lang"].unique())

num_languages_checked = len(df_wiki)
num_languages_found = len(langs_with_hits)
num_languages_checked = len(df_wiki)        # alla språk som genomsöktes
num_languages_found = df_digg['lang'].nunique()

In [None]:

def get_git_commit():
    try:
        return subprocess.check_output(
            ["git", "rev-parse", "--short", "HEAD"],
            stderr=subprocess.DEVNULL
        ).decode().strip()
    except Exception:
        return "unknown"


In [None]:
GITHUB_REPO = "https://github.com/salgo60/DIGG-skuggbacklog"
commit_hash = get_git_commit()
commit_url = f"{GITHUB_REPO}/commit/{commit_hash}"


In [None]:
script_name = SCRIPT_NAME

In [None]:
script_name

In [None]:
domains


In [None]:
from pathlib import Path
from datetime import date, datetime
import pandas as pd
import subprocess

def save_sortable_html_df_digg(
    df,
    out_dir="results",
    domains=None,
    issue_url="https://github.com/salgo60/DIGG-skuggbacklog/issues/1467",
):
    out_dir = Path(out_dir)
    out_dir.mkdir(exist_ok=True)

    today = date.today().strftime("%Y_%m_%d")
    rerun_ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    out_path = out_dir / f"links_digg_{today}.html"

    # Metadata
    commit_hash = get_git_commit()

    domains = domains or [] 
    

    # Gör länkar klickbara
    df = df.copy()
    if "wiki_link" in df.columns:
        df["wiki_link"] = df["wiki_link"].apply(
            lambda x: f'<a href="{x}" target="_blank">{x}</a>' if pd.notna(x) else ""
        )
    if "url" in df.columns:
        df["url"] = df["url"].apply(
            lambda x: f'<a href="{x}" target="_blank">{x}</a>' if pd.notna(x) else ""
        )

    html_table = df.to_html(
        classes="pivot",
        border=0,
        escape=False,
        index=False
    )

    css = """
    <style>
      body {
        font-family: Arial, sans-serif;
        margin: 20px;
      }
      table.pivot {
        border-collapse: collapse;
        width: 100%;
        font-size: 12px;
      }
      table.pivot th, table.pivot td {
        border: 1px solid #999;
        padding: 6px 8px;
        text-align: left;
        vertical-align: top;
        white-space: normal;
      }
      table.pivot th {
        cursor: pointer;
        background: #f2f2f2;
      }
      table.pivot th:hover {
        background: #e2e2e2;
      }
      .meta {
        background: #f8f8f8;
        border: 1px solid #ccc;
        padding: 12px;
        margin-bottom: 20px;
        font-size: 13px;
      }
      .meta h2 {
        margin-top: 0;
      }
      .meta ul {
        margin: 0;
        padding-left: 18px;
      }
    </style>
    """

    js = """
    <script>
    document.addEventListener('DOMContentLoaded', () => {
        document.querySelectorAll("table.pivot th").forEach((header, colIndex) => {
            header.addEventListener("click", () => {
                const table = header.closest("table");
                const tbody = table.querySelector("tbody");
                const rows = Array.from(tbody.querySelectorAll("tr"));
                const asc = !header.classList.contains("sorted-asc");

                rows.sort((a, b) => {
                    const A = a.children[colIndex].innerText.trim();
                    const B = b.children[colIndex].innerText.trim();

                    const numA = parseFloat(A.replace(",", ".")) || null;
                    const numB = parseFloat(B.replace(",", ".")) || null;

                    if (numA !== null && numB !== null) {
                        return asc ? numA - numB : numB - numA;
                    }
                    return asc ? A.localeCompare(B) : B.localeCompare(A);
                });

                table.querySelectorAll("th").forEach(th =>
                    th.classList.remove("sorted-asc","sorted-desc")
                );
                header.classList.add(asc ? "sorted-asc" : "sorted-desc");

                rows.forEach(row => tbody.appendChild(row));
            });
        });
    });
    </script>
    """

    html = f"""
    <html>
    <head>
      <meta charset="utf-8">
      <title>DIGG links in Wikipedia</title>
      {css}
    </head>
    <body>

      <h1>Wikipedia → DIGG</h1>

      <div class="meta">
        <h2>Metadata</h2>
        <ul>
          <li><strong>Rerun:</strong> {rerun_ts}</li>
          <li>
              <strong>Script:</strong>
              <a href="{SCRIPT_URL}" target="_blank">
                {SCRIPT_NAME} </a>
        </li>
          <li>
              <strong>Git commit:</strong>
              <a href="{commit_url}" target="_blank">{commit_hash}</a>
          </li>
          <li><strong>Issue:</strong>
            <a href="{issue_url}" target="_blank">{issue_url.split('/')[-1]}</a>
          </li>
          <li><strong>Datakällor:</strong>
            <ul>
              <li>Wikidata</li>
              <li>MediaWiki API – exturlusage</li>
            </ul>
          </li>
          <li><strong>Parametrar:</strong>
            <ul>
              <li>Språkfilter: exkluderar "special"</li>
              </ul>

          </li>
              <li><strong>Antal språk genomsökta:</strong> {num_languages_checked}</li>
              <li><strong>Antal språk med träffar:</strong> {num_languages_found}</li>
              <li><strong>Wikipedia-språk med träffar:</strong> {", ".join(langs_with_hits)}</li>
                
              <li><strong>Totala länkar:</strong> {total_links}</li>
              <li><strong>Unika länkar:</strong> {total_unique_links}</li>

        </ul>
      </div>

      <p>Sorterbar tabell. Klicka på kolumnrubriker för sortering.</p>

      {html_table}

      {js}
    </body>
    </html>
    """

    out_path.write_text(html, encoding="utf-8")
    print(f"✅ HTML skapad: {out_path}")


In [None]:
save_sortable_html_df_digg(df_digg)


In [None]:
 # End timer and calculate duration
end_time = time.time()
elapsed_time = end_time - start_time# Bygg audit-lager för den här etappen

# Print current date and total time
print("Date:", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
minutes, seconds = divmod(elapsed_time, 60)
print("Total time elapsed: {:02.0f} minutes {:05.2f} seconds".format(minutes, seconds))
