* [#17](https://github.com/salgo60/ifkdb/issues/17) "Vandra igenom alla Kategorier och skapa koppling wdt:P54"


In [1]:
from datetime import datetime
# Remember when this run began; the last cell of the notebook reads
# start_time to report the total runtime.
start_time = datetime.now()
print(f"Last run:  {start_time}")


Last run:  2026-02-26 08:02:29.601381


In [None]:
# (Stray bare reference to get_team_qid_from_category removed: evaluating the
# name here, before the cell that defines it has run, raises NameError on a
# fresh kernel and breaks "Restart Kernel -> Run All".)

In [None]:
import html
import time
from datetime import datetime
from urllib.parse import quote

import pandas as pd
import requests
from IPython.display import display, HTML

# Endpoints: the MediaWiki Action API of Swedish Wikipedia and the Wikidata
# SPARQL query service.
WIKI_API = "https://sv.wikipedia.org/w/api.php"
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# One shared HTTP session; every request sent through it carries this
# identifying User-Agent header.
S = requests.Session()
S.headers["User-Agent"] = (
    "Wikipedia_cat/1.0 (https://github.com/salgo60/ifkdb; contact: salgo60@msn.com)"
)

# ---------------------------------------------------
# Hjälpfunktioner
# ---------------------------------------------------

def get_subcategories(category_title):
    """Return the titles of all direct subcategories of a category.

    Pages through the MediaWiki ``list=categorymembers`` query
    (``cmtype=subcat``) on ``WIKI_API`` via the shared session ``S`` until
    the response no longer contains a ``continue`` object.

    Args:
        category_title: Category title passed verbatim as ``cmtitle``
            (including its namespace prefix, e.g. ``"Kategori:..."``).

    Returns:
        list[str]: Subcategory titles in the order the API returned them.
        Empty if the responses carry no ``query`` section.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
        requests.Timeout: If the API does not answer within the timeout.
    """
    params = {
        "action": "query",
        "list": "categorymembers",
        "cmtitle": category_title,
        "cmtype": "subcat",
        "cmlimit": "500",
        "format": "json",
        "formatversion": "2",
    }
    titles = []

    while True:
        # A timeout keeps a stalled connection from hanging the notebook.
        r = S.get(WIKI_API, params=params, timeout=30)
        r.raise_for_status()
        data = r.json()

        # A response may legitimately lack "query" (e.g. an empty final
        # batch); treat that as "no members" instead of raising KeyError.
        members = data.get("query", {}).get("categorymembers", [])
        titles.extend(m["title"] for m in members)

        continuation = data.get("continue")
        if not continuation:
            break

        # Feed the whole continuation object back into the next request
        # rather than cherry-picking a single key from it.
        params.update(continuation)
        time.sleep(0.3)

    return titles


def get_category_players(category_title):
    """Return the main-namespace members of a category that have a Wikidata item.

    Uses ``generator=categorymembers`` restricted to namespace 0 together
    with ``prop=pageprops`` so each page's ``wikibase_item`` (its Wikidata
    QID) arrives in the same response. Pages without a ``wikibase_item``
    are skipped.

    Args:
        category_title: Category title passed verbatim as ``gcmtitle``.

    Returns:
        list[dict]: One ``{"name": <page title>, "qid": <QID>}`` entry per
        page, in first-seen order, without duplicates.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
        requests.Timeout: If the API does not answer within the timeout.
    """
    params = {
        "action": "query",
        "generator": "categorymembers",
        "gcmtitle": category_title,
        "gcmlimit": "max",
        "gcmnamespace": 0,
        "prop": "pageprops",
        # Only the Wikidata link is needed; skip all other page properties.
        "ppprop": "wikibase_item",
        "format": "json",
        "formatversion": "2",
    }
    # Keyed by title so a page repeated across continuation batches is
    # reported once; dict insertion order keeps first-seen ordering.
    qid_by_title = {}

    while True:
        r = S.get(WIKI_API, params=params, timeout=30)
        r.raise_for_status()
        data = r.json()

        for page in data.get("query", {}).get("pages", []):
            qid = page.get("pageprops", {}).get("wikibase_item")
            if qid:
                qid_by_title[page["title"]] = qid

        continuation = data.get("continue")
        if not continuation:
            break

        # With generator + prop the continuation may carry "ppcontinue"
        # instead of "gcmcontinue"; passing the whole object back handles
        # both instead of raising KeyError on the missing key.
        params.update(continuation)
        time.sleep(0.3)

    return [{"name": title, "qid": qid} for title, qid in qid_by_title.items()]


def get_team_qid_from_category(category_title):
    """Resolve a Wikipedia category to the Wikidata QID of its club.

    Two steps:
      1. Ask the Wikipedia API for the category page's ``wikibase_item``
         (the QID of the category itself).
      2. Ask the Wikidata SPARQL endpoint for a ``wdt:P971`` value of that
         category item that is an instance of (a subclass of)
         ``wd:Q476028`` (presumably the "association football club" class;
         verify on Wikidata).

    Both requests go through the shared session ``S`` so they reuse its
    connection pool and User-Agent header.

    Args:
        category_title: Category title passed verbatim as ``titles``.

    Returns:
        str | None: QID of the first matching club, or ``None`` when the
        category has no Wikidata item or the query returns no rows. If
        several clubs match, whichever row comes first is used.

    Raises:
        requests.HTTPError: If either service answers with an error status.
        requests.Timeout: If either service does not answer in time.
    """
    r = S.get(
        WIKI_API,
        params={
            "action": "query",
            "titles": category_title,
            "prop": "pageprops",
            "format": "json",
            "formatversion": "2",
        },
        timeout=30,
    )
    r.raise_for_status()

    pages = r.json().get("query", {}).get("pages", [])
    if not pages:
        return None

    category_qid = pages[0].get("pageprops", {}).get("wikibase_item")
    if not category_qid:
        return None

    query = f"""
    SELECT ?club WHERE {{
      wd:{category_qid} wdt:P971 ?club .
      ?club wdt:P31/wdt:P279* wd:Q476028 .
    }}
    """

    # Per-request headers are merged with the session headers, so the
    # User-Agent configured on S is sent automatically. The SPARQL timeout
    # is longer because the subclass path can be slow to evaluate.
    r = S.get(
        SPARQL_ENDPOINT,
        params={"query": query},
        headers={"Accept": "application/sparql-results+json"},
        timeout=90,
    )
    r.raise_for_status()

    results = r.json()["results"]["bindings"]
    if results:
        # The value is an entity URI; the QID is its last path segment.
        return results[0]["club"]["value"].split("/")[-1]

    return None


def get_players_via_p54(team_qid):
    """Return the QIDs of all Wikidata items with ``wdt:P54`` pointing at a team.

    The query goes through the shared session ``S`` so it reuses the
    session's connection pool and User-Agent header.

    Args:
        team_qid: Wikidata QID of the team, interpolated into the SPARQL
            query as ``wd:<team_qid>``.

    Returns:
        set[str]: QIDs (last path segment of each ``?player`` URI). Empty
        when the query returns no rows.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        requests.Timeout: If the endpoint does not answer in time.
    """
    query = f"""
    SELECT ?player WHERE {{
      ?player wdt:P54 wd:{team_qid}.
    }}
    """

    # Per-request headers are merged with the session headers, so only the
    # Accept header has to be given here.
    r = S.get(
        SPARQL_ENDPOINT,
        params={"query": query},
        headers={"Accept": "application/sparql-results+json"},
        timeout=90,
    )
    r.raise_for_status()

    bindings = r.json()["results"]["bindings"]
    return {row["player"]["value"].split("/")[-1] for row in bindings}


# ---------------------------------------------------
# Huvudrapport
# ---------------------------------------------------

def _wiki_url(title):
    """Return the sv.wikipedia.org URL for a page title, percent-encoded for use in an href."""
    return "https://sv.wikipedia.org/wiki/" + quote(title.replace(" ", "_"), safe=":/")


def _player_row_html(club_label, club_url, player, has_p54):
    """Render one report table row; ``club_label`` must already be HTML-escaped."""
    player_name = html.escape(player["name"])
    player_qid = html.escape(player["qid"])
    player_url = _wiki_url(player["name"])
    wikidata_url = "https://www.wikidata.org/wiki/" + quote(player["qid"])
    p54_url = f"{wikidata_url}#P54"

    status_symbol = "✅" if has_p54 else "❌"
    status_text = "Har P54" if has_p54 else "Saknar P54"
    # Rows for players lacking the statement are highlighted.
    row_color = "" if has_p54 else ' style="background-color:#ffdddd;"'

    return f"""
                <tr{row_color}>
                    <td><a href="{club_url}" target="_blank">{club_label}</a></td>
                    <td><a href="{player_url}" target="_blank">{player_name}</a></td>
                    <td><a href="{wikidata_url}" target="_blank">{player_qid}</a></td>
                    <td>{status_symbol} <a href="{p54_url}" target="_blank">{status_text}</a></td>
                </tr>
            """


def generate_report(main_category):
    """Audit P54 coverage for every club subcategory and write two files.

    For each subcategory of ``main_category`` the club's Wikidata item is
    resolved, the players in the Wikipedia category are compared with the
    items that already have ``wdt:P54`` pointing at that club, and the
    result is rendered as one collapsible HTML table per club.

    Files written to the current working directory:
      * ``rapport_<YYYYMMDD>.html``: the report.
      * ``quickstatements_P54_missing_<YYYYMMDD>.txt``: one
        ``<player QID>|P54|<club QID>`` line per missing statement,
        sorted and de-duplicated.

    Page titles come from Wikipedia and may contain characters such as
    ``&``, ``<`` or quotes, so they are HTML-escaped in text and
    percent-encoded in URLs.

    Args:
        main_category: Title of the parent category whose subcategories
            are audited.

    Returns:
        tuple[str, str]: ``(report_filename, qs_filename)``.
    """
    clubs = get_subcategories(main_category)

    total_players = 0
    total_correct = 0
    total_missing = 0

    today_str = datetime.now().strftime("%Y%m%d")
    qs_filename = f"quickstatements_P54_missing_{today_str}.txt"
    report_filename = f"rapport_{today_str}.html"

    quickstatements = set()

    # Collected HTML fragments, joined once at the end.
    sections = ["""
    <h1>Revisionsrapport</h1>
    """]

    # ---------------------------------------------------
    # Walk through every club
    # ---------------------------------------------------

    for club_cat in sorted(clubs):

        print("Analyserar:", club_cat)

        team_qid = get_team_qid_from_category(club_cat)
        if not team_qid:
            # No club item could be resolved for this category; skip it.
            continue

        wiki_players = get_category_players(club_cat)
        wd_players = get_players_via_p54(team_qid)

        # Identical for every player of the club, so built once per club.
        club_label = html.escape(club_cat.replace("Kategori:", ""))
        club_url = _wiki_url(club_cat)

        club_rows = []
        club_missing = 0

        for player in sorted(wiki_players, key=lambda x: x["name"]):
            has_p54 = player["qid"] in wd_players
            if not has_p54:
                club_missing += 1
                quickstatements.add(f'{player["qid"]}|P54|{team_qid}')
            club_rows.append(
                _player_row_html(club_label, club_url, player, has_p54)
            )

        club_total = len(club_rows)
        if club_total == 0:
            continue

        total_players += club_total
        total_missing += club_missing
        total_correct += club_total - club_missing

        percent_missing = round((club_missing / club_total) * 100, 1)

        # Collapsible block per club
        sections.append(f"""
        <details>
            <summary>
                <strong>{club_label}</strong>
                – Spelare: {club_total}
                – Saknar P54: {club_missing}
                ({percent_missing}%)
            </summary>

            <table border="1" cellpadding="6" cellspacing="0">
                <tr>
                    <th>Lag</th>
                    <th>Spelare</th>
                    <th>Wikidata</th>
                    <th>P54 status</th>
                </tr>
                {''.join(club_rows)}
            </table>
            <br>
        </details>
        """)

        # Small pause between clubs to stay polite towards the APIs.
        time.sleep(0.2)

    # ---------------------------------------------------
    # Header summary
    # ---------------------------------------------------

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M")

    # The charset declaration lets browsers decode the UTF-8 file (Swedish
    # letters, dashes, status symbols) correctly when it is opened from disk.
    html_header = f"""
    <meta charset="utf-8">
    <h1>Revisionsrapport – Fotbollsspelare i klubblag i Sverige</h1>
    
    <p>
    <strong>Rapport skapad:</strong> {timestamp}
    <br>
    <strong>GitHub Issue:</strong>
    <a href="https://github.com/salgo60/ifkdb/issues/17" target="_blank">
    https://github.com/salgo60/ifkdb/issues/17
    </a>
    </p>
    
    <ul>
        <li><strong>Antal lag:</strong> {len(clubs)}</li>
        <li><strong>Antal spelare:</strong> {total_players}</li>
        <li><strong>Med korrekt wdt:P54:</strong> {total_correct}</li>
        <li><strong>Saknar wdt:P54:</strong> {total_missing}</li>
    </ul>
    <hr>
    """
    full_html = html_header + "".join(sections)

    # ---------------------------------------------------
    # Save the report
    # ---------------------------------------------------

    with open(report_filename, "w", encoding="utf-8") as f:
        f.write(full_html)

    # ---------------------------------------------------
    # Save the QuickStatements
    # ---------------------------------------------------

    quickstatements_lines = sorted(quickstatements)

    with open(qs_filename, "w", encoding="utf-8") as f:
        f.write("\n".join(quickstatements_lines))

    print(f"\nRapport sparad: {report_filename}")
    print(f"QuickStatements sparad: {qs_filename}")
    print(f"Antal QuickStatements: {len(quickstatements_lines)}")

    return report_filename, qs_filename


# ---------------------------------------------------
# Kör rapporten
# ---------------------------------------------------

# Parent category; each of its subcategories is audited as one club.
category = "Kategori:Fotbollsspelare_i_klubblag_i_Sverige"

# NOTE: despite the name, df_report holds the (report_filename, qs_filename)
# tuple returned by generate_report, not a DataFrame.
df_report = generate_report(category)
print(df_report)

Analyserar: Kategori:Fotbollsspelare i AFC Eskilstuna
Analyserar: Kategori:Fotbollsspelare i AFC United
Analyserar: Kategori:Fotbollsspelare i AIK
Analyserar: Kategori:Fotbollsspelare i AIK:s damlag
Analyserar: Kategori:Fotbollsspelare i Ahlafors IF
Analyserar: Kategori:Fotbollsspelare i Akropolis IF
Analyserar: Kategori:Fotbollsspelare i Alingsås IF
Analyserar: Kategori:Fotbollsspelare i Alnö IF
Analyserar: Kategori:Fotbollsspelare i Alvesta GIF
Analyserar: Kategori:Fotbollsspelare i Angered MBIK
Analyserar: Kategori:Fotbollsspelare i Annebergs IF
Analyserar: Kategori:Fotbollsspelare i Arameisk-Syrianska Botkyrka IF
Analyserar: Kategori:Fotbollsspelare i Ariana FC
Analyserar: Kategori:Fotbollsspelare i Asarums IF
Analyserar: Kategori:Fotbollsspelare i Askims IK
Analyserar: Kategori:Fotbollsspelare i Assyriska BK
Analyserar: Kategori:Fotbollsspelare i Assyriska FF
Analyserar: Kategori:Fotbollsspelare i Assyriska IK
Analyserar: Kategori:Fotbollsspelare i Azalea BK
Analyserar: Kategori:F

In [None]:
from datetime import datetime

def generate_quickstatements(df):
    """Write a QuickStatements file with P54 statements for flagged rows.

    Only rows whose ``"P54 status"`` equals ``"❌ Saknar P54"`` are used.
    For each such row the player QID is the last ``/``-separated segment
    of the ``"Wikidata"`` value, and the club QID is looked up from the
    category ``"Kategori:" + row["Lag"]``. Lookups are cached per club so
    each club costs one round of network requests rather than one per
    player.

    Args:
        df: DataFrame with the columns ``"P54 status"``, ``"Wikidata"``
            and ``"Lag"``.

    Returns:
        str: Name of the written file,
        ``quickstatements_P54_missing_<YYYYMMDD>.txt``, containing sorted,
        de-duplicated ``<player QID>|P54|<club QID>`` lines.
    """
    today_str = datetime.now().strftime("%Y%m%d")
    filename = f"quickstatements_P54_missing_{today_str}.txt"

    # Keep only the rows flagged as missing P54.
    df_missing = df[df["P54 status"] == "❌ Saknar P54"]

    # Club name -> club QID (or None); avoids repeating the Wikipedia and
    # SPARQL requests for every player of the same club.
    team_qid_by_club = {}
    statements = set()

    for _, row in df_missing.iterrows():

        player_qid = row["Wikidata"].split("/")[-1]

        club = row["Lag"]
        if club not in team_qid_by_club:
            team_qid_by_club[club] = get_team_qid_from_category("Kategori:" + club)
        team_qid = team_qid_by_club[club]

        if team_qid:
            statements.add(f"{player_qid}|P54|{team_qid}")

    # The set already removed duplicates; sort for a stable file.
    lines = sorted(statements)

    with open(filename, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))

    print(f"QuickStatements-fil skapad: {filename}")
    print(f"Antal statements: {len(lines)}")

    return filename


# ---------------------------------------------------
# Kör generering
# ---------------------------------------------------

# generate_report() returns the tuple (report_filename, qs_filename) and has
# already written the QuickStatements file. Passing that tuple to
# generate_quickstatements() would raise TypeError on df["P54 status"], so
# the DataFrame-based generator only runs when a DataFrame is available.
if isinstance(df_report, pd.DataFrame):
    qs_file = generate_quickstatements(df_report)
else:
    qs_file = df_report[1]

In [None]:
# Runtime summary: wall-clock time elapsed since start_time was recorded in
# the first cell.
end_time = datetime.now()
duration = end_time - start_time

print("\n===== Körningsrapport =====")
print("Starttid :", start_time.strftime("%Y-%m-%d %H:%M:%S"))
print("Sluttid  :", end_time.strftime("%Y-%m-%d %H:%M:%S"))

# Break the whole-second duration into hours, minutes and seconds.
total_seconds = int(duration.total_seconds())
hours = total_seconds // 3600
minutes, seconds = divmod(total_seconds % 3600, 60)

print(f"Körtid   : {hours}h {minutes}m {seconds}s")
print("===========================\n")