### snokadb.se
* [#13](https://github.com/salgo60/ifkdb/issues/13)
* Notebook [13_snokadb_se](https://github.com/salgo60/ifkdb/blob/main/Notebook/13_snokadb_se.ipynb)

In [1]:
from datetime import datetime
start_time  = datetime.now()
print("Last run: ", start_time)

Last run:  2026-02-23 12:30:06.221648


In [2]:
import nest_asyncio
nest_asyncio.apply()

In [3]:
import asyncio
import re
import pandas as pd
from playwright.async_api import async_playwright

async def scrape_players():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto("https://snokadb.se/spelare", timeout=60000)
        await page.wait_for_selector("table")

        rows = await page.query_selector_all("table tbody tr")
        data = []

        for row in rows:
            cols = await row.query_selector_all("td")
            if len(cols) < 4:
                continue

            raw_name = await cols[0].inner_text()
            name = re.sub(r"[^\w\s\-\"]", "", raw_name).strip()

            seasons = (await cols[1].inner_text()).strip()
            matches = (await cols[2].inner_text()).strip()
            goals = (await cols[3].inner_text()).strip()

            link = await row.query_selector("a")
            href = await link.get_attribute("href") if link else ""
            slug = href.split("/")[-1] if href else ""

            data.append({
                "snokadb_id": slug,
                "name": name,
                "seasons": seasons,
                "matches": matches,
                "goals": goals
            })

        await browser.close()

    return pd.DataFrame(data)

df = asyncio.run(scrape_players())
df.head(), len(df)

(           snokadb_id                name                 seasons matches  \
 0      abdoulie-tamba      Abdoulie Tamba                    2025       5   
 1  alexander-fransson  Alexander Fransson     2013 och 7 säsonger     201   
 2     amadeus-sogaard     Amadeus Sögaard              2024, 2025      34   
 3      anton-eriksson      Anton Eriksson  2022, 2023, 2024, 2025      79   
 4    arnor-traustason    Arnor Traustason     2014 och 6 säsonger     151   
 
   goals  
 0     0  
 1    17  
 2     0  
 3     0  
 4    35  ,
 586)

In [4]:
import asyncio
from playwright.async_api import async_playwright

async def enrich_birthdates(df):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context()
        page = await context.new_page()

        birthdates = []

        for slug in df["snokadb_id"]:
            url = f"https://snokadb.se/spelare/{slug}"

            try:
                await page.goto(url, timeout=60000)
                
                # Vänta tills React är färdig
                await page.wait_for_load_state("networkidle")

                # Vänta kortare på table (inte 30s)
                await page.wait_for_selector("table", timeout=5000)

                rows = await page.query_selector_all("table tr")
                birthdate = ""

                for row in rows:
                    text = await row.inner_text()
                    if "Födelsedatum" in text:
                        tds = await row.query_selector_all("td")
                        birthdate = (await tds[0].inner_text()).strip()
                        break

            except Exception as e:
                print(f"Misslyckades för {slug}")
                birthdate = ""

            birthdates.append(birthdate)

        await browser.close()

    df["birthdate"] = birthdates
    return df

In [5]:
df.to_csv("snokadb_players_raw.csv", index=False)

In [6]:
df

Unnamed: 0,snokadb_id,name,seasons,matches,goals
0,abdoulie-tamba,Abdoulie Tamba,2025,5,0
1,alexander-fransson,Alexander Fransson,2013 och 7 säsonger,201,17
2,amadeus-sogaard,Amadeus Sögaard,"2024, 2025",34,0
3,anton-eriksson,Anton Eriksson,"2022, 2023, 2024, 2025",79,0
4,arnor-traustason,Arnor Traustason,2014 och 6 säsonger,151,35
...,...,...,...,...,...
581,ake-brynje,Åke Brynje,1939/40,1,0
582,ake-johansson,Åke Johansson,1949/50 och 15 säsonger,321,1
583,ake-larsson,Åke Larsson,"1939/40, 1940/41",7,0
584,ake-skarin,Åke Skarin,1939/40,2,0


In [7]:
# Wikipedia/WIkidata  
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd

sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

query = """
SELECT ?item ?itemLabel ?birthdate ?svwiki WHERE {
  ?item wdt:P54 wd:Q329607
  OPTIONAL { ?item wdt:P569 ?birthdate }
  OPTIONAL {
    ?svwiki schema:about ?item ;
            schema:isPartOf <https://sv.wikipedia.org/> .
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "sv,en". }
}
"""

sparql.setQuery(query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

rows = []
for r in results["results"]["bindings"]:
    rows.append({
        "wikidata_qid": r["item"]["value"].split("/")[-1],
        "name_wd": r["itemLabel"]["value"],
        "birthdate_wd": r.get("birthdate", {}).get("value", ""),
        "wikipedia_sv": r.get("svwiki", {}).get("value", "")
    })

wd_df = pd.DataFrame(rows)
wd_df.head(), len(wd_df)

(  wikidata_qid             name_wd          birthdate_wd wikipedia_sv
 0      Q308080  Abderrahman Kabous  1983-04-24T00:00:00Z             
 1      Q576245    Kyle Lightbourne  1968-09-29T00:00:00Z             
 2      Q629453    Slobodan Marović  1964-07-13T00:00:00Z             
 3      Q634973      Janne Hietanen  1978-06-02T00:00:00Z             
 4      Q724829     Balázs Rabóczki  1978-01-09T00:00:00Z             ,
 209)

In [8]:
# Matcha mot snokadb  
merged = df.merge(
    wd_df,
    left_on="name",
    right_on="name_wd",
    how="left"
)

merged[["snokadb_id","name","wikidata_qid","wikipedia_sv"]].head()

Unnamed: 0,snokadb_id,name,wikidata_qid,wikipedia_sv
0,abdoulie-tamba,Abdoulie Tamba,,
1,alexander-fransson,Alexander Fransson,,
2,amadeus-sogaard,Amadeus Sögaard,,
3,anton-eriksson,Anton Eriksson,,
4,arnor-traustason,Arnor Traustason,,


In [9]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 592 entries, 0 to 591
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   snokadb_id    592 non-null    object
 1   name          592 non-null    object
 2   seasons       592 non-null    object
 3   matches       592 non-null    object
 4   goals         592 non-null    object
 5   wikidata_qid  150 non-null    object
 6   name_wd       150 non-null    object
 7   birthdate_wd  150 non-null    object
 8   wikipedia_sv  150 non-null    object
dtypes: object(9)
memory usage: 41.8+ KB


In [10]:
def normalize(n):
    return n.lower().replace('"','').strip()

df["name_norm"] = df["name"].apply(normalize)
wd_df["name_norm"] = wd_df["name_wd"].apply(normalize)

merged = df.merge(
    wd_df,
    on="name_norm",
    how="left"
)

In [11]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 592 entries, 0 to 591
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   snokadb_id    592 non-null    object
 1   name          592 non-null    object
 2   seasons       592 non-null    object
 3   matches       592 non-null    object
 4   goals         592 non-null    object
 5   name_norm     592 non-null    object
 6   wikidata_qid  150 non-null    object
 7   name_wd       150 non-null    object
 8   birthdate_wd  150 non-null    object
 9   wikipedia_sv  150 non-null    object
dtypes: object(10)
memory usage: 46.4+ KB


In [12]:
final = merged[[
    "snokadb_id",
    "name",
    "wikidata_qid",
    "wikipedia_sv",
    "birthdate_wd"
]]

final.head()

Unnamed: 0,snokadb_id,name,wikidata_qid,wikipedia_sv,birthdate_wd
0,abdoulie-tamba,Abdoulie Tamba,,,
1,alexander-fransson,Alexander Fransson,,,
2,amadeus-sogaard,Amadeus Sögaard,,,
3,anton-eriksson,Anton Eriksson,,,
4,arnor-traustason,Arnor Traustason,,,


In [13]:
final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 592 entries, 0 to 591
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   snokadb_id    592 non-null    object
 1   name          592 non-null    object
 2   wikidata_qid  150 non-null    object
 3   wikipedia_sv  150 non-null    object
 4   birthdate_wd  150 non-null    object
dtypes: object(5)
memory usage: 23.3+ KB


### Wikipedia / Wikidata 
* [Kategorin har 283](https://sv.wikipedia.org/w/index.php?title=Kategori:Fotbollsspelare_i_IFK_Norrk%C3%B6ping&pagefrom=Nilsson%2C+David+Mitov%0ADavid+Mitov+Nilsson#mw-pages)
* [Wikidata 202](https://w.wiki/Hx5x)

In [14]:
import requests
import time

S = requests.Session()
S.headers.update({
    "User-Agent": "IFK-POC/1.0 (https://github.com/yourrepo; contact: your@email)"
})

URL = "https://sv.wikipedia.org/w/api.php"

def get_category_members():
    members = []
    params = {
        "action": "query",
        "list": "categorymembers",
        "cmtitle": "Kategori:Fotbollsspelare i IFK Norrköping",
        "cmlimit": "500",
        "format": "json",
        "formatversion": "2"
    }

    while True:
        r = S.get(URL, params=params, timeout=30)
        r.raise_for_status()

        # debug om något går fel
        if "application/json" not in r.headers.get("Content-Type", ""):
            print("Fick inte JSON:")
            print(r.text[:500])
            break

        data = r.json()

        members.extend(data["query"]["categorymembers"])

        if "continue" in data:
            params.update(data["continue"])
            time.sleep(0.5)  # snäll mot API:t
        else:
            break

    return [m["title"] for m in members]

titles = get_category_members()
len(titles)

294

In [15]:
titles

['Ishaq Abdulrazak',
 'Samuel Adegbenro',
 'Viktor Agardius',
 'Enis Ahmetovic',
 'Astrit Ajdarević',
 'Ernst Albihn',
 'Anders Alé',
 'Albert Aleksanjan',
 'Andreas Alm',
 'Pontus Almqvist',
 'Kevin Álvarez',
 'Stefan Alvén',
 'Kevin Amuneke',
 'David Andersson (fotbollsspelare)',
 'Kennet Andersson',
 'Leif Andersson (fotbollsspelare född 1952)',
 'Sebastian Andersson',
 'Sune Andersson (fotbollsspelare)',
 'Åke Andersson (fotbollsspelare född 2007)',
 'Andri Guðjohnsen',
 'Filip Apelstav',
 'Ari Freyr Skúlason',
 'Arnór Ingvi Traustason',
 'Arnór Sigurðsson',
 'Kristoffer Arvhage',
 'Sven Axbom',
 'Lennart Backman',
 'Nuredin Bakiu',
 'Daniel Bamberg',
 'Modou Barrow',
 'Sven-Olof "Berka" Bergman',
 'Göran Bergort',
 'Kristian Bergström',
 'Fredrik Bild',
 'Harry Bild',
 'Egzon Binaku',
 'Jóhannes Kristinn Bjarnason',
 'Jonas Bjurström',
 'Carl Björk (fotbollsspelare född 2000)',
 'Pelle Blohm',
 'Mikael Blomberg',
 'Andreas Blomqvist',
 'Stephen Bolma',
 'David Boo Wiklander',
 'Je

In [16]:
import requests
import time
import pandas as pd

WD_API = "https://www.wikidata.org/w/api.php"

session = requests.Session()
session.headers.update({
    "User-Agent": "IFK-Norrkoping-POC/1.0 (https://github.com/dittrepo; kontakt: salgö60@msn.com)"
})

In [17]:
# slå upp titles 
import requests
import pandas as pd
import time


def lookup_titles_in_batches(titles, batch_size=50):
    results = []

    for i in range(0, len(titles), batch_size):
        batch = titles[i:i+batch_size]
        titles_str = "|".join(batch)

        params = {
            "action": "wbgetentities",
            "sites": "svwiki",
            "titles": titles_str,
            "props": "claims|sitelinks",
            "redirects": "yes",
            "format": "json"
        }
        r = session.get(WD_API, params=params)
        r.raise_for_status()
        data = r.json()

        entities = data.get("entities", {})

        for title in batch:
            qid = None
            birthdate = None
            has_p54 = False

            # hitta rätt entity via sitelink
            for entity in entities.values():
                if entity.get("sitelinks", {}).get("svwiki", {}).get("title") == title:
                    qid = entity.get("id")
                    claims = entity.get("claims", {})

                    # birthdate
                    if "P569" in claims:
                        birthdate = claims["P569"][0]["mainsnak"]["datavalue"]["value"]["time"]

                    # P54 IFK
                    if "P54" in claims:
                        for c in claims["P54"]:
                            target = c["mainsnak"]["datavalue"]["value"]["id"]
                            if target == "Q329607":  # IFK Norrköping
                                has_p54 = True
                    break

            results.append({
                "wikipedia_sv": title,
                "wikidata_qid": qid,
                "birthdate_wd": birthdate,
                "has_P54_IFK": has_p54
            })

        time.sleep(0.5)  # var snäll mot API:t

    return pd.DataFrame(results)

df_wiki = lookup_titles_in_batches(titles)
df_wiki.head() 


Unnamed: 0,wikipedia_sv,wikidata_qid,birthdate_wd,has_P54_IFK
0,Ishaq Abdulrazak,Q96361469,+2002-05-05T00:00:00Z,True
1,Samuel Adegbenro,Q19152611,+1995-12-03T00:00:00Z,False
2,Viktor Agardius,Q5543358,+1989-10-23T00:00:00Z,False
3,Enis Ahmetovic,Q19787867,+1992-01-26T00:00:00Z,False
4,Astrit Ajdarević,Q751101,+1990-04-17T00:00:00Z,True


In [18]:
df_wiki.to_csv("ifkwikipedia.csv")

In [19]:
df_wiki["has_P54_IFK"].value_counts()

has_P54_IFK
False    151
True     143
Name: count, dtype: int64

In [20]:
df_wiki.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   wikipedia_sv  294 non-null    object
 1   wikidata_qid  294 non-null    object
 2   birthdate_wd  294 non-null    object
 3   has_P54_IFK   294 non-null    bool  
dtypes: bool(1), object(3)
memory usage: 7.3+ KB


## Status rapport

In [21]:
import pandas as pd

# Wikipedia-länk
from urllib.parse import quote

df_wiki["wikipedia_url"] = df_wiki["wikipedia_sv"].apply(
    lambda x: "https://sv.wikipedia.org/wiki/" + quote(x.replace(" ", "_"))
)
# Wikidata-länk
df_wiki["wikidata_url"] = "https://www.wikidata.org/wiki/" + df_wiki["wikidata_qid"]

# SnokaDb-länk (matcha på namn)
name_to_slug = dict(zip(df["name"], df["snokadb_id"]))
df_wiki["snokadb_id"] = df_wiki["wikipedia_sv"].map(name_to_slug)
df_wiki["snokadb_url"] = "https://snokadb.se/spelare/" + df_wiki["snokadb_id"].fillna("")

In [22]:
total_snokadb = len(df)
total_wikipedia = len(df_wiki)
total_p54 = df_wiki["has_P54_IFK"].sum()
missing_p54 = total_wikipedia - total_p54

coverage_percent = round((total_p54 / total_wikipedia) * 100, 1)

In [23]:
bois_set = set([p["name"] for p in bois_clean])
wiki_set = set(wiki_clean)

wiki_exact = bois_set & wiki_set
len(wiki_exact)

NameError: name 'bois_clean' is not defined

In [None]:
# ================================
# BOIS STATUSRAPPORT – FULL POC
# ================================

# --- Säkerställ BoIS-set ---
bois_set = set([p["name"] for p in bois_clean])

# --- Wikidata P54 data ---
wd_qids = set([w["qid"] for w in wd_players])
wd_name_to_qid = {w["name"]: w["qid"] for w in wd_players}

# --- Wikipedia data ---
wiki_set = set(wiki_titles)

# Matcha Wikipedia → Wikidata via QID
wiki_with_p54 = []
wiki_without_p54 = []

for title in wiki_titles:
    qid = wiki_qids.get(title)
    if qid and qid in wd_qids:
        wiki_with_p54.append(title)
    else:
        wiki_without_p54.append(title)

# --- Nyckeltal ---
total_bois = len(bois_set)
total_wiki = len(wiki_titles)
total_p54 = len(wd_qids)
missing_p54 = len(wiki_without_p54)
coverage_percent = round((total_p54 / total_bois) * 100, 1)

# --- Bygg rapport ---
report = f"""
# Statusrapport Landskrona BoIS – Wikipedia / Wikidata / BoIS-historia

Klubb: Landskrona BoIS  
Wikipedia-kategori: {total_wiki} spelare  
BoIS-historia: {total_bois} spelare  

## Wikidata-struktur (P54 → Landskrona BoIS)

- Har korrekt P54: {total_p54}
- Wikipedia-artiklar utan P54: {missing_p54}
- Total täckning (BoIS → WD): {coverage_percent} %

---

## Wikipedia-kategori – Identifierarstatus

| Spelare | Wikidata | P54 |
|---------|----------|------|
"""

# Sortera så ❌ kommer först
sorted_titles = sorted(
    wiki_titles,
    key=lambda t: t in wiki_with_p54
)

for title in sorted_titles:
    wiki_link = f"[{title}](https://sv.wikipedia.org/wiki/{title.replace(' ', '_')})"
    
    qid = wiki_qids.get(title)
    
    if qid:
        wd_link = f"[{qid}](https://www.wikidata.org/wiki/{qid})"
        has_p54 = qid in wd_qids
    else:
        wd_link = "–"
        has_p54 = False
    
    status = "✅" if has_p54 else "❌"
    
    report += f"| {wiki_link} | {wd_link} | {status} |\n"



In [None]:
# ================================
# HÄMTA WIKIDATA P54 – BOIS
# ================================

from SPARQLWrapper import SPARQLWrapper, JSON

sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

query = """
SELECT ?item ?itemLabel WHERE {
  ?item wdt:P54 wd:Q863273 .
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "sv,en,mul" .
  }
}
"""

sparql.setQuery(query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

wd_players = []

for r in results["results"]["bindings"]:
    wd_players.append({
        "qid": r["item"]["value"].split("/")[-1],
        "name": r["itemLabel"]["value"]
    })

print("Wikidata P54 spelare:", len(wd_players))

In [None]:
# ================================
# BOIS STATUSRAPPORT – STABIL POC
# ================================

import re

# --- 1. Normalisera BoIS-namn (utan beroende av bois_clean) ---

def normalize_bois(name):
    # Ta bort smeknamn i citattecken
    name = re.sub(r'"[^"]+"', '', name)
    # Ta bort parenteser
    name = re.sub(r'\([^)]*\)', '', name)
    # Ta bort ev årtal i slutet
    name = re.sub(r'\b\d{2,4}\b$', '', name)
    name = name.strip()
    
    # Invertera "Efternamn, Förnamn"
    if "," in name:
        last, first = name.split(",", 1)
        name = first.strip() + " " + last.strip()
    
    return " ".join(name.split())

bois_set = set([normalize_bois(p["name"]) for p in players])

# --- 2. Wikidata P54 data ---

wd_qids = set([w["qid"] for w in wd_players])

# --- 3. Wikipedia data ---

wiki_set = set(wiki_titles)

# Wikipedia → Wikidata via QID
wiki_with_p54 = []
wiki_without_p54 = []

for title in wiki_titles:
    qid = wiki_qids.get(title)
    if qid and qid in wd_qids:
        wiki_with_p54.append(title)
    else:
        wiki_without_p54.append(title)

# --- 4. Nyckeltal ---

total_bois = len(bois_set)
total_wiki = len(wiki_titles)
total_p54 = len(wd_qids)
missing_p54 = len(wiki_without_p54)
coverage_percent = round((total_p54 / total_bois) * 100, 1)

# --- 5. Bygg rapport ---

report = f"""
# Statusrapport Landskrona BoIS – Wikipedia / Wikidata / BoIS-historia

Klubb: Landskrona BoIS  
Wikipedia-kategori: {total_wiki} spelare  
BoIS-historia: {total_bois} spelare  

## Wikidata-struktur (P54 → Landskrona BoIS)

- Har korrekt P54: {total_p54}
- Wikipedia-artiklar utan P54: {missing_p54}
- Total täckning (BoIS → WD): {coverage_percent} %

---

## Wikipedia-kategori – Identifierarstatus

| Spelare | Wikidata | P54 |
|---------|----------|------|
"""

# Sortera så ❌ visas först
sorted_titles = sorted(
    wiki_titles,
    key=lambda t: t in wiki_with_p54
)

for title in sorted_titles:
    wiki_link = f"[{title}](https://sv.wikipedia.org/wiki/{title.replace(' ', '_')})"
    
    qid = wiki_qids.get(title)
    
    if qid:
        wd_link = f"[{qid}](https://www.wikidata.org/wiki/{qid})"
        has_p54 = qid in wd_qids
    else:
        wd_link = "–"
        has_p54 = False
    
    status = "✅" if has_p54 else "❌"
    
    report += f"| {wiki_link} | {wd_link} | {status} |\n"

print(report)

In [2]:
from datetime import datetime
end_time = datetime.now()

duration = end_time - start_time

print("\n===== Körningsrapport =====")
print("Starttid :", start_time.strftime("%Y-%m-%d %H:%M:%S"))
print("Sluttid  :", end_time.strftime("%Y-%m-%d %H:%M:%S"))

total_seconds = int(duration.total_seconds())
hours = total_seconds // 3600
minutes = (total_seconds % 3600) // 60
seconds = total_seconds % 60

print(f"Körtid   : {hours}h {minutes}m {seconds}s")
print("===========================\n")

NameError: name 'start_time' is not defined